I have been putting together a piece of code I found on Stack Overflow on how to split a URL's output into separate CSV files, and I came up with the code below. However, I can no longer use fields_to_export in the code. How can I set the fields to export so that they are exported like fields_to_export = ['itemA', 'itemB', 'itemC']?
from scrapy import signals
from scrapy.exporters import CsvItemExporter
import re
class appPipeline(object):
    urls = [l.strip() for l in open('data/listOfUrls.txt').readlines()]
    names = [name.group(1) for l in urls for name in [re.search(r'https://www.google.co.uk/', l, re.M|re.I)] if name]

    def __init__(self):
        self.files = {}
        self.exporters = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        self.files = dict([(name, open('results/' + name + '.csv', 'w+b')) for name in self.names])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.names])
        ### this line is not working: self.exportes.fields_to_export = ['itemA','itemB','itemC']
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        myItem = item['myItem']
        if myItem in set(self.names):
            self.exporters[myItem].export_item(item)
        return item
So far I have tried overriding the keys in items, serializing the items, and looking at how to sort the values in a dictionary by a list of keys. None of them worked.
Thanks for your help.
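A minimal sketch of one way the desired column order could be set, assuming a current Scrapy version where CsvItemExporter accepts fields_to_export as a constructor keyword argument: pass the list when each exporter is created in spider_opened, so every per-name CSV gets the same columns.
from scrapy.exporters import CsvItemExporter

FIELDS_TO_EXPORT = ['itemA', 'itemB', 'itemC']

def spider_opened(self, spider):
    self.files = {name: open('results/' + name + '.csv', 'w+b') for name in self.names}
    # fields_to_export is set per exporter, so each CSV uses the same column order.
    self.exporters = {
        name: CsvItemExporter(self.files[name], fields_to_export=FIELDS_TO_EXPORT)
        for name in self.names
    }
    for exporter in self.exporters.values():
        exporter.start_exporting()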
I'm trying to save image URLs for individual properties in their respective CSV files via feed exports. For this to work, the FEEDS csv path in custom_settings has to be changed every time a scrapy.Request is yielded in start_requests. Every time a scrapy.Request is yielded, self.feeds_csv_path (set in __init__) is assigned a new CSV file path corresponding to the property id; it is then fetched into FEEDS by def get_feeds_csv_path, as in the code below. The self.feeds_csv_path in custom_settings doesn't seem to be able to access def get_feeds_csv_path. Where is the error here?
import asyncio
from configparser import ConfigParser
import os
import pandas as pd
import scrapy
import requests
import json
class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings = {
        "FEEDS": {
            self.feeds_csv_path: {
                "format": "csv",
                "overwrite": True
            }
        }
    }

    def __init__(self, *args, **kwargs):
        self.feeds_csv_path = None
        super(GetpropertyimgurlsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        files = self.get_html_files()  # List of html file full paths
        for file in files[:2]:
            self.feeds_csv_path = self.get_feeds_csv_path(file)
            yield scrapy.Request(file, callback=self.parse)

    def parse(self, response):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        #print(text)
        photos = json_text["@graph"][3]["photo"]
        for photo in photos:
            yield photo["contentUrl"]

    def get_feeds_csv_path(self, html_file_path):
        property_id = html_file_path.split("/")[-2].split("_")[1]
        feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
        return feeds_csv_path

    def get_path(self):
        config = ConfigParser()
        config.read("config.ini")  # Location relative to main.py
        path = config["scrapezoopla"]["path"]
        return path

    # Returns a list of html file dirs
    def get_html_files(self):
        path = self.get_path()
        dir = f"{path}/data/properties/"
        dir_list = os.listdir(dir)
        folders = []
        for ins in dir_list:
            if os.path.isdir(f"{dir}{ins}") == True:
                append_ins = folders.append(ins)
        html_files = []
        for folder in folders:
            html_file = f"{dir}{folder}/{folder}.html"
            if os.path.isfile(html_file) == True:
                append_html_file = html_files.append(f"file:///{html_file}")
        return html_files
The first problem I see is that you are using the self keyword in the namespace scope of your spider class. The self keyword is only available inside instance methods, where you pass it in as the first argument, e.g. def __init__(self, ...).
Even if self were available it still wouldn't work, because once you create the custom_settings dictionary, self.feeds_csv_path is immediately converted to its string value at class-definition time, so updating the instance variable later would have no effect on the custom_settings property.
Another issue is that Scrapy collects all of the custom settings and stores them internally before the crawl actually starts, so updating the custom_settings dictionary mid-crawl might not have any effect. I am not certain about that, though.
All of that being said, your goal is still achievable. One way I can think of is to create the FEEDS dictionary at runtime, prior to initiating the crawl, and to use custom scrapy.Item classes to filter which item belongs to which output.
I have no way of testing it, so it might be buggy, but here is an example of what I am referring to:
from configparser import ConfigParser
import json
import os
import scrapy

def get_path():
    config = ConfigParser()
    config.read("config.ini")  # Location relative to main.py
    path = config["scrapezoopla"]["path"]
    return path

# Returns a list of html file dirs
def get_html_files():
    path = get_path()
    folder = f"{path}/data/properties/"
    dir_list = os.listdir(folder)
    html_files = []
    for ins in dir_list:
        if os.path.isdir(f"{folder}{ins}"):
            if os.path.isfile(f"{folder}{ins}/{ins}.html"):
                html_files.append(f"file:///{folder}{ins}/{ins}.html")
    return html_files

def get_feeds_csv_path(html_file_path):
    property_id = html_file_path.split("/")[-2].split("_")[1]
    feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
    return feeds_csv_path

def create_custom_item():
    class Item(scrapy.Item):
        contentUrl = scrapy.Field()
    return Item

def customize_settings():
    feeds = {}
    files = get_html_files()
    start_urls = {}
    for path in files:
        custom_class = create_custom_item()
        output_path = get_feeds_csv_path(path)
        start_urls[path] = custom_class
        feeds[output_path] = {
            "format": "csv",
            "item_classes": [custom_class],
        }
    custom_settings = {"FEEDS": feeds}
    return custom_settings, start_urls

class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings, start_urls = customize_settings()

    def start_requests(self):
        for uri, itemclass in self.start_urls.items():
            yield scrapy.Request(uri, callback=self.parse, cb_kwargs={'itemclass': itemclass})

    def parse(self, response, itemclass):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        photos = json_text["@graph"][3]["photo"]
        for photo in photos:
            item = itemclass()
            item['contentUrl'] = photo["contentUrl"]
            yield item
I'd like to export data to several columns in a CSV, but I always obtain this kind of file:
[screenshot of the exported CSV]
I'd like to obtain two columns, one "articulo" and another one "price".
My pipelines:
import scrapy
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
import csv
class MercadoPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['articulo', 'precio']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Can you help me please?
Here you are:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from mercado.items import MercadoItem
class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domain = ['www.autodoc.es']
    start_urls = ['https://www.autodoc.es/search?brandNo%5B0%5D=101']

    rules = {
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//span[@class="next"]/a'))),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="ga-click"]')),
             callback='parse_item', follow=False)
    }

    def parse_item(self, response):
        ml_item = MercadoItem()
        # info de producto
        ml_item['articulo'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[2]/div[1]/div[1]/div/span[1]/span/text())').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[3]/div[2]/p[2]/text())').extract()
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield ml_item
There is nothing wrong with the output of your code.
You are getting the two csv columns you want, but the program you are using to view the data is not interpreting it correctly.
By default, CsvItemExporter uses , as the delimiter, and the program seems to expect something else (and possibly even different quoting).
There are two possibilities to solve your problem:
Change the program's settings so it reads the file correctly
Change the way CsvItemExporter exports data (it will pass any additional keyword arguments to the underlying csv.writer object)
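As an illustration of the second option, here is a minimal sketch, assuming the pipeline from the question and a viewer that expects semicolon-separated values; any extra keyword argument such as delimiter is forwarded to the underlying csv.writer:
def spider_opened(self, spider):
    file = open('%s_items.csv' % spider.name, 'w+b')
    self.files[spider] = file
    # Extra keyword arguments (here, delimiter) are passed through to csv.writer.
    self.exporter = CsvItemExporter(file, delimiter=';')
    self.exporter.fields_to_export = ['articulo', 'precio']
    self.exporter.start_exporting()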
I am using Scrapy to parse a table containing links and save it to JSON. The links from the table contain additional detail, which will be fetched and stored in another JSON file (following this example: https://docs.scrapy.org/en/latest/topics/exporters.html).
To achieve this I am using a pipeline to check the item type and store the result in the appropriate JSON file. However, I am stuck on a weird error. Please see below:
from scrapy import signals
from scrapy.exporters import JsonItemExporter
from for_icu import items
class ListPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("spider_opened")
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()

        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        print("spider_closed")
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
            return item
        elif isinstance(item, items.UniDetail):
            self.exporter22.export_item(item)
            return item
Error:
2017-12-27 11:41:15 [scrapy.core.scraper] ERROR: Error processing {'country': ('Finland',),
 'country_code': ('fi ',),
 'u_link': ('http://www.xxxxxxx.xxx/xxxxxxx/xxxx.htm',),
 'u': (' pisto',)}
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/Users/website_scrapy/for_icu/for_icu/pipelines.py", line 31, in process_item
    self.exporter.export_item(item)
AttributeError: 'ListPipeline' object has no attribute 'exporter1'
Please let me know what I am missing here; I have been stuck on this for the past couple of hours.
I was unable to get the exporter to work, so I used a simple file writer for the task:
class ListPipeline(object):
    unilist = []
    unidetail = []

    def close_spider(self, spider):
        print("spider_closed")
        file_table = open('%s_table.json' % spider.name, 'w')
        line = json.dumps(self.unilist)
        file_table.write(line)
        file_table.close()

        file_detail = open('%s_detail.json' % spider.name, 'w')
        line = json.dumps(self.unidetail)
        file_detail.write(line)
        file_detail.close()

        self.unilist.clear()
        self.unidetail.clear()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.unilist.append(dict((item)))
            return item
        elif isinstance(item, items.UniDetail):
            self.unidetail.append(dict((item)))
            return item
This achieves what I want, but it would be better to use the built-in exporters. If someone knows how to make that work, please update.
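For what it's worth, a sketch of how the original exporter-based pipeline might be fixed, assuming the AttributeError comes from spider_opened failing part-way through: self.files[spider] is appended to before a list exists for that key, and process_item refers to exporter22 instead of exporter2.
from scrapy import signals
from scrapy.exporters import JsonItemExporter
from for_icu import items

class ListPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Create the list first; a KeyError here would abort spider_opened
        # and leave exporter1/exporter2 unset, producing the AttributeError.
        self.files[spider] = []
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
        elif isinstance(item, items.UniDetail):
            self.exporter2.export_item(item)  # was exporter22 in the question code
        return item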
I'm trying to write out a CSV file after scraping using pipelines, but the formatting is a bit weird: instead of printing it top to bottom, it prints everything from page 1 at once and then all of page 2, each in one column. I have attached pipelines.py and one line from the CSV output (quite large). So how do I make it print row by row instead of all at once per page?
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
class CSVPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['names', 'stars', 'subjects', 'reviews']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
and output.csv
names stars subjects
Vivek0388,NikhilVashisth,DocSharad,Abhimanyu_swarup,Suresh N,kaushalhkapadia,JyotiMallick,Nitin T,mhdMumbai,SunilTukrel(COLUMN 2) 5 of 5 stars,4 of 5 stars,1 of 5 stars,5 of 5 stars,3 of 5 stars,4 of 5 stars,5 of 5 stars,5 of 5 stars,4 of 5 stars,4 of 5 stars(COLUMN 3) Best Stay,Awesome View... Nice Experience!,Highly mismanaged and dishonest.,A Wonderful Experience,Good place with average front office,Honeymoon,Awesome Resort,Amazing,ooty's beauty!!,Good stay and food
It should look something like this
Vivek0388 5 of 5
NikhilVashisth 5 of 5
DocSharad 5 of 5
...so on
EDIT:
items = [{'reviews': "", 'subjects': "", 'names': "", 'stars': ""} for k in range(1000)]

if (sites and len(sites) > 0):
    for site in sites:
        i += 1
        items[i]['names'] = item['names']
        items[i]['stars'] = item['stars']
        items[i]['subjects'] = item['subjects']
        items[i]['reviews'] = item['reviews']
        yield Request(url="http://tripadvisor.in" + site, callback=self.parse)

for k in range(1000):
    yield items[k]
Figured it out: use the csv module, zip the item fields, then loop through the rows and write each one. This was much less complicated once I read the docs.
import csv
import itertools
class CSVPipeline(object):

    def __init__(self):
        self.csvwriter = csv.writer(open('items.csv', 'wb'), delimiter=',')
        self.csvwriter.writerow(['names', 'stars', 'subjects', 'reviews'])

    def process_item(self, item, ampa):
        rows = zip(item['names'], item['stars'], item['subjects'], item['reviews'])
        for row in rows:
            self.csvwriter.writerow(row)
        return item
For my scrapy project I'm currently using the ImagesPipeline. The downloaded images are stored with a SHA1 hash of their URLs as the file names.
How can I store the files using my own custom file names instead?
What if my custom file name needs to contain another scraped field from the same item? e.g. use item['desc'] as the file name for the image downloaded from item['image_url']. If I understand correctly, that would involve somehow accessing the other item fields from the image pipeline.
Any help will be appreciated.
This is just an update of the answer for Scrapy 0.24 (EDITED), where image_key() is deprecated:
class MyImagesPipeline(ImagesPipeline):

    # Name download version
    def file_path(self, request, response=None, info=None):
        # item = request.meta['item']  # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)

    # Name thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + response.url.split('/')[-1]
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        # yield Request(item['images'])  # Adding meta. I don't know how to put it in one line :-)
        for image in item['images']:
            yield Request(image)
In Scrapy 0.12 I solved it with something like this:
class MyImagesPipeline(ImagesPipeline):

    # Name download version
    def image_key(self, url):
        image_guid = url.split('/')[-1]
        return 'full/%s.jpg' % (image_guid)

    # Name thumbnail version
    def thumb_key(self, url, thumb_id):
        image_guid = thumb_id + url.split('/')[-1]
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        yield Request(item['images'])
I found my way in 2017, with Scrapy 1.1.3:
def file_path(self, request, response=None, info=None):
    return request.meta.get('filename', '')

def get_media_requests(self, item, info):
    img_url = item['img_url']
    meta = {'filename': item['name']}
    yield Request(url=img_url, meta=meta)
As in the code above, you can add the name you want to the Request meta in get_media_requests(), and get it back in file_path() via request.meta.get('filename', '').
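A self-contained sketch of the same idea, assuming Scrapy 2.4+ import paths and an item with hypothetical img_url and name fields (the class name NamedImagesPipeline is also made up for illustration):
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class NamedImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Carry the desired file name along with the download request.
        yield Request(url=item['img_url'], meta={'filename': item['name']})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Read the name back; the returned path is relative to IMAGES_STORE.
        return request.meta.get('filename', '')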
This is the way I solved the problem in Scrapy 0.10.
Check the persist_image method of FSImagesStoreChangeableDirectory; the filename of the downloaded image is the key.
class FSImagesStoreChangeableDirectory(FSImagesStore):

    def persist_image(self, key, image, buf, info, append_path):
        absolute_path = self._get_filesystem_path(append_path + '/' + key)
        self._mkdir(os.path.dirname(absolute_path), info)
        image.save(absolute_path)

class ProjectPipeline(ImagesPipeline):

    def __init__(self):
        super(ImagesPipeline, self).__init__()
        store_uri = settings.IMAGES_STORE
        if not store_uri:
            raise NotConfigured
        self.store = FSImagesStoreChangeableDirectory(store_uri)
I did a nasty quick hack for that. In my case I stored the title of the image in my feed, and I had only one image_urls entry per item, so I wrote the following script. It basically renames the image files in the /images/full/ directory to the corresponding title from the item feed, which I had stored as JSON.
import os
import json

img_dir = os.path.join(os.getcwd(), 'images\\full')
item_dir = os.path.join(os.getcwd(), 'data.json')

with open(item_dir, 'r') as item_json:
    items = json.load(item_json)

for item in items:
    if len(item['images']) > 0:
        cur_file = item['images'][0]['path'].split('/')[-1]
        cur_format = cur_file.split('.')[-1]
        new_title = item['title'] + '.%s' % cur_format
        file_path = os.path.join(img_dir, cur_file)
        os.rename(file_path, os.path.join(img_dir, new_title))
It's nasty and not recommended, but it is a naive alternative approach.
I rewrote the code, changing "response." to "request." in the thumb_path def. Otherwise it won't work, because "response is set to None".
class MyImagesPipeline(ImagesPipeline):

    # Name download version
    def file_path(self, request, response=None, info=None):
        # item = request.meta['item']  # Like this you can use all from item, not just url.
        image_guid = request.url.split('/')[-1]
        return 'full/%s' % (image_guid)

    # Name thumbnail version
    def thumb_path(self, request, thumb_id, response=None, info=None):
        image_guid = thumb_id + request.url.split('/')[-1]
        return 'thumbs/%s/%s.jpg' % (thumb_id, image_guid)

    def get_media_requests(self, item, info):
        # yield Request(item['images'])  # Adding meta. Dunno how to put it in one line :-)
        for image in item['images']:
            yield Request(image)
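To actually use one of these subclasses, it has to be registered in the project settings in place of the stock ImagesPipeline. A minimal sketch, assuming a hypothetical project package named myproject:
# settings.py -- enable the custom pipeline instead of the default ImagesPipeline
ITEM_PIPELINES = {
    'myproject.pipelines.MyImagesPipeline': 300,
}
IMAGES_STORE = '/path/to/images'  # file_path() return values are relative to this directory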