I'm using Scrapy to download images from http://www.vesselfinder.com/vessels
However, I can only get the relative URL of the images, which resolves to something like this: http://www.vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0
All of the images are named 0.jpg, but if I try to use that as an absolute URL, I cannot access the image.
My code:
items.py
import scrapy

class VesselItem(scrapy.Item):
    name = scrapy.Field()
    nationality = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem

class VesselPipeline(object):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
vessel_spider.py
import scrapy
import string
from vessel.items import VesselItem

class VesselSpider(scrapy.Spider):
    """docstring for VesselSpider"""
    name = "vessel"
    allowed_domains = ["vesselfinder.com"]
    page_name = "http://vesselfinder.com"
    start_urls = [
        # "http://vesselfinder.com/vessels?page=%d" %i for i in range(0,1000)
        "http://vesselfinder.com/vessels"
    ]

    def parse(self, response):
        f = open('vessels.txt', 'a')
        count = 0;
        for sel in response.xpath('//div[@class="items"]/article'):
            item = VesselItem()
            imageStr = str(sel.xpath('div[1]/a/picture/img/@src').extract())
            item['image_urls'] = self.page_name + imageStr[3:-2]
            nameStr = str(sel.xpath('div[2]/header/h1/a/text()').extract())
            item['name'] = nameStr[19:-8]
            typeStr = str(sel.xpath('div[2]/div[2]/div[2]/text()').extract())
            item['type'] = typeStr[3:-2]
        return item
When I run this spider, I get exceptions.ValueError: Missing scheme in request url: h, because I did not provide the absolute URL.
[vessel] ERROR: Error processing {'image_urls': 'http://vesselfinder.com/vessels/ship-photo/0-224138470-a2fdc783d05a019d00ad9db0cef322f7/0.jpg',
'name': 'XILGARO ALEANTE',
'type': 'Sailing vessel'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
d.callback(input)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 383, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 491, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 578, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/media.py", line 40, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/images.py", line 104, in get_media_requests
return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 26, in __init__
self._set_url(url)
File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
How should I fix this? Is there any special way to get the image (or its absolute URL) from a site like this one?
Wrap your image url in a list like so:
item['image_urls'] = [self.page_name + imageStr[3:-2]]
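If you prefer to avoid the string slicing altogether, here is a hedged sketch of the same loop using extract_first() and response.urljoin() (both available in Scrapy 1.0+); the XPath is copied from the question and may still need adjusting:

def parse(self, response):
    for sel in response.xpath('//div[@class="items"]/article'):
        item = VesselItem()
        # extract_first() gives one string (or None) instead of a list repr
        image_src = sel.xpath('div[1]/a/picture/img/@src').extract_first()
        if image_src:
            # urljoin resolves the relative src against the page URL;
            # image_urls must be a list for the images pipeline
            item['image_urls'] = [response.urljoin(image_src)]
        item['name'] = (sel.xpath('div[2]/header/h1/a/text()').extract_first() or '').strip()
        yield item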
I think the following code will do the trick (very few changes to your code),
vessel_spider.py
class VesselSpider(scrapy.Spider):
    """docstring for VesselSpider"""
    name = "vessel"
    allowed_domains = ["vesselfinder.com"]
    page_name = "http://vesselfinder.com"
    start_urls = [
        # "http://vesselfinder.com/vessels?page=%d" %i for i in range(0,1000)
        "http://vesselfinder.com/vessels"
    ]

    def parse(self, response):
        f = open('vessels.txt', 'a')
        count = 0;
        for sel in response.xpath('//div[@class="items"]/article'):
            item = VesselItem()
            imageStr = sel.xpath('./div[@class="small-12 medium-5 large-5 columns"]/a/picture/img/@src').extract()
            imageStr = imageStr[0] if imageStr else 'N/A'
            item['image_urls'] = [self.page_name + imageStr]
            nameStr = sel.xpath('./div/header/h1[@class="subheader"]/a/text()').extract()
            nameStr = ' '.join(' '.join(nameStr).split()) if nameStr else 'N/A'
            item['name'] = nameStr
            typeStr = sel.xpath('.//div[@class="small-4 columns" and contains(text(), "Ship type")]/following-sibling::div/text()').extract()
            typeStr = typeStr[0].strip() if typeStr else 'N/A'
            item['ship_type'] = typeStr
            yield item
items.py
class VesselItem(scrapy.Item):
    name = scrapy.Field()
    nationality = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    ship_type = scrapy.Field()
Appending the sample output:
{'image_urls': u'http://vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0',
'name': u'IBTISAM ATAO',
'ship_type': u'Sailing vessel'}
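One more thing worth checking, since the project settings are not shown in either snippet: an images pipeline only downloads anything if it is enabled in settings.py and given a storage folder. A minimal sketch, assuming the stock pipeline is used (on older Scrapy versions the path is scrapy.contrib.pipeline.images.ImagesPipeline, and a custom class such as vessel.pipelines.VesselPipeline would be referenced instead if that is the one meant to run):

# settings.py -- sketch; adjust the storage path
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/path/to/vessel/images'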
I'm trying to save the image URLs for individual properties in their respective CSV files via feed exports. For this to work, the FEEDS csv_path in custom_settings has to change every time a scrapy.Request is yielded in start_requests. Every time a scrapy.Request is yielded, self.feeds_csv_path (initialised in __init__) is assigned a new CSV file path corresponding to the property ID, which is built by def get_feeds_csv_path, as in the code below. The self.feeds_csv_path in custom_settings doesn't seem to be able to access def get_feeds_csv_path; where is the error here?
import asyncio
from configparser import ConfigParser
import os
import pandas as pd
import scrapy
import requests
import json

class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings = {
        "FEEDS": {
            self.feeds_csv_path: {
                "format": "csv",
                "overwrite": True
            }
        }
    }

    def __init__(self, *args, **kwargs):
        self.feeds_csv_path = None
        super(GetpropertyimgurlsSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        files = self.get_html_files()  # List of html file full paths
        for file in files[:2]:
            self.feeds_csv_path = self.get_feeds_csv_path(file)
            yield scrapy.Request(file, callback=self.parse)

    def parse(self, response):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        #print(text)
        photos = json_text["@graph"][3]["photo"]
        for photo in photos:
            yield photo["contentUrl"]

    def get_feeds_csv_path(self, html_file_path):
        property_id = html_file_path.split("/")[-2].split("_")[1]
        feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
        return feeds_csv_path

    def get_path(self):
        config = ConfigParser()
        config.read("config.ini")  # Location relative to main.py
        path = config["scrapezoopla"]["path"]
        return path

    #Returns a list of html file dirs
    def get_html_files(self):
        path = self.get_path()
        dir = f"{path}/data/properties/"
        dir_list = os.listdir(dir)
        folders = []
        for ins in dir_list:
            if os.path.isdir(f"{dir}{ins}") == True:
                append_ins = folders.append(ins)
        html_files = []
        for folder in folders:
            html_file = f"{dir}{folder}/{folder}.html"
            if os.path.isfile(html_file) == True:
                append_html_file = html_files.append(f"file:///{html_file}")
        return html_files
The first problem I see is that you are using the self keyword in the namespace scope of your spider class. The self keyword is only available inside instance methods, where you pass it in as the first argument, e.g. def __init__(self, ...).
Even if self were available, it still wouldn't work, because once you create the custom_settings dictionary, self.feeds_csv_path is immediately evaluated to its value at that moment, so updating the instance variable later would have no effect on the custom_settings property.
Another issue is that Scrapy collects all of the custom settings and stores them internally before the crawl is actually started, and updating the custom_settings dictionary mid-crawl might not actually have an effect. I am not certain about that, though.
All of that being said, your goal is still achievable. One approach I can think of is to create the FEEDS dictionary at runtime, but prior to initiating the crawl, and to use custom scrapy.Item classes to filter which item belongs to which output.
I have no way of testing it so it might be buggy but here is an example of what I am referring to:
from configparser import ConfigParser
import json
import os
import scrapy

def get_path():
    config = ConfigParser()
    config.read("config.ini")  # Location relative to main.py
    path = config["scrapezoopla"]["path"]
    return path

#Returns a list of html file dirs
def get_html_files():
    path = get_path()
    folder = f"{path}/data/properties/"
    dir_list = os.listdir(folder)
    html_files = []
    for ins in dir_list:
        if os.path.isdir(f"{folder}{ins}"):
            if os.path.isfile(f"{folder}{ins}/{ins}.html"):
                html_files.append(f"file:///{folder}{ins}/{ins}.html")
    return html_files

def get_feeds_csv_path(html_file_path):
    property_id = html_file_path.split("/")[-2].split("_")[1]
    feeds_csv_path = f"{html_file_path}/images/Property_{property_id}_ImgSrcs.csv"
    return feeds_csv_path
def create_custom_item():
    class Item(scrapy.Item):
        contentUrl = scrapy.Field()
    return Item

def customize_settings():
    feeds = {}
    files = get_html_files()
    start_urls = {}
    for path in files:
        custom_class = create_custom_item()
        output_path = get_feeds_csv_path(path)
        start_urls[path] = custom_class
        feeds[output_path] = {
            "format": "csv",
            "item_classes": [custom_class],
        }
    custom_settings = {"FEEDS": feeds}
    return custom_settings, start_urls

class GetpropertyimgurlsSpider(scrapy.Spider):
    name = 'GetPropertyImgUrls'
    custom_settings, start_urls = customize_settings()

    def start_requests(self):
        for uri, itemclass in self.start_urls.items():
            yield scrapy.Request(uri, callback=self.parse, cb_kwargs={'itemclass': itemclass})

    def parse(self, response, itemclass):
        texts = response.xpath("//text()").getall()
        text = texts[1]
        json_text = json.loads(text)
        photos = json_text["@graph"][3]["photo"]
        for photo in photos:
            item = itemclass()
            item['contentUrl'] = photo["contentUrl"]
            yield item
What I want to do is read every URL from a file and scrape it. After that, I will move the scraped data to the class WebRealTor, then serialize the data to JSON and finally save it all in a JSON file.
This is the content of the file:
https://www.seloger.com/annonces/achat/appartement/paris-14eme-75/montsouris-dareau/143580615.htm?ci=750114&idtt=2,5&idtypebien=2,1&LISTING-LISTpg=8&naturebien=1,2,4&tri=initial&bd=ListToDetail
https://www.seloger.com/annonces/achat/appartement/montpellier-34/gambetta/137987697.htm?ci=340172&idtt=2,5&idtypebien=1,2&naturebien=1,2,4&tri=initial&bd=ListToDetail
https://www.seloger.com/annonces/achat/appartement/montpellier-34/celleneuve/142626025.htm?ci=340172&idtt=2,5&idtypebien=1,2&naturebien=1,2,4&tri=initial&bd=ListToDetail
https://www.seloger.com/annonces/achat/appartement/versailles-78/domaine-national-du-chateau/138291887.htm
And my script is:
import scrapy
import json

class selogerSpider(scrapy.Spider):
    name = "realtor"
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
            'scrapy_fake_useragent.middleware.RandomUserAgentMiddleware': 400,
        }
    }

    def start_requests(self):
        with open("annonces.txt", "r") as file:
            for line in file.readlines():
                yield scrapy.Request(line)

    def parse(self, response):
        name = response.css(".agence-link::text").extract_first()
        address = response.css(".agence-adresse::text").extract_first()
        XPATH_siren = ".//div[@class='legalNoticeAgency']//p/text()"
        siren = response.xpath(XPATH_siren).extract_first()
        XPATH_website = ".//div[@class='agence-links']//a/@href"
        site = response.xpath(XPATH_website).extract()
        XPATH_phone = ".//div[@class='contact g-row-50']//div[@class='g-col g-50 u-pad-0']//button[@class='btn-phone b-btn b-second fi fi-phone tagClick']/@data-phone"
        phone = response.xpath(XPATH_phone).extract_first()
        yield {
            'Agency_Name =': name,
            'Agency_Address =': address,
            'Agency_profile_website =': site,
            'Agency_number =': phone,
            'Agency_siren =': siren
        }
        file.close()

class WebRealTor:
    def __name__(self):
        self.nom = selogerSpider.name

    def __address__(self):
        self.adress = selogerSpider.address

    def __sirenn__(self):
        self.sire = selogerSpider.siren

    def __numero__(self):
        self.numero = selogerSpider.phone

with open('data.txt', 'w') as outfile:
    json.dump(data, outfile)
Try to move everything to start_requests in your class. Like this:
def start_requests(self):
    with open("annonces.txt", "r") as file:
        for line in file.readlines():
            yield scrapy.Request(line)  # self.parse is the default callback

def parse(self, response):
    # each link parsing as you already did
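If the end goal is simply one JSON file with all the scraped data, it may be easier to drop the WebRealTor class and let Scrapy serialize the yielded dicts itself. A minimal sketch, assuming Scrapy 2.1+ where the FEEDS setting exists (on older versions the equivalent is running scrapy crawl realtor -o data.json):

class selogerSpider(scrapy.Spider):
    name = "realtor"
    custom_settings = {
        # every dict yielded from parse() is appended to data.json
        "FEEDS": {
            "data.json": {"format": "json"},
        },
    }
    # start_requests() and parse() as above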
I've got a CSV file with URLs and I need to scrape metadata from those websites. I'm using Python requests for that, with the code below:
from tempfile import NamedTemporaryFile
import shutil
import csv
from bs4 import BeautifulSoup
import requests
import re
import html5lib
import sys
#import logging

filename = 'TestWTF.csv'
#logging.basicConfig(level=logging.DEBUG)

#Get filename (with extension) from terminal
#filename = sys.argv[1]
tempfile = NamedTemporaryFile(delete=False)
read_timeout = 1.0

#Does actual scraping done, returns metaTag data
def getMetadata (url, metaTag):
    r = requests.get("http://" + url, timeout=2)
    data = r.text
    soup = BeautifulSoup(data, 'html5lib')
    metadata = soup.findAll(attrs={"name":metaTag})
    return metadata

#Gets either keyword or description
def addDescription ( row ):
    scrapedKeywordsData = getMetadata(row, 'keywords')
    if not scrapedKeywordsData:
        print row + ' NO KEYWORDS'
        scrapedKeywordsData = getMetadata(row, 'description')
        if not scrapedKeywordsData:
            return ''
    return scrapedKeywordsData[0]

def prepareString ( data ):
    output = data
    #Get rid of opening meta content
    if output.startswith( '<meta content="' ):
        output = data[15:]
    #Get rid of closing meta content (keywords)
    if output.endswith( '" name="keywords"/>' ):
        output = output[:-19]
    #Get rid of closing meta content (description)
    if output.endswith( '" name="description"/>' ):
        output = output[:-22]
    return output

def iterator():
    with open(filename, 'rb') as csvFile, tempfile:
        reader = csv.reader(csvFile, delimiter=',', quotechar='"')
        writer = csv.writer(tempfile, delimiter=',', quotechar='"')
        i = 0
        for row in reader:
            try:
                data = str(addDescription (row[1] ))
                row[3] = prepareString( data )
            except requests.exceptions.RequestException as e:
                print e
            except requests.exceptions.Timeout as e:
                print e
            except requests.exceptions.ReadTimeout as e:
                print "lol"
            except requests.exceptions.ConnectionError as e:
                print "These aren't the domains we're looking for."
            except requests.exceptions.ConnectTimeout as e:
                print "Too slow Mojo!"
            writer.writerow(row)
            i = i + 1
            print i
    shutil.move(tempfile.name, filename)

def main():
    iterator()

#Defining main function
if __name__ == '__main__':
    main()
It works just fine, but at some URLs (out of 3000, let's say maybe 2-3) it suddenly stops working and does not progress to the next one after the timeout. So I have to kill it using Ctrl+C, which results in the file not being saved.
I know it's a problem of catching exceptions, but I cannot figure out which one, or what to do about it. I'm more than happy to simply ignore the URL it gets stuck on.
EDIT:
Added traceback:
^CTraceback (most recent call last):
File "blacklist.py", line 90, in <module>
main()
File "blacklist.py", line 85, in main
iterator()
File "blacklist.py", line 62, in iterator
data = str(addDescription (row[1] ))
File "blacklist.py", line 30, in addDescription
scrapedKeywordsData = getMetadata(row, 'keywords')
File "blacklist.py", line 25, in getMetadata
metadata = soup.findAll(attrs={"name":metaTag})
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1259, in find_all
return self._find_all(name, attrs, text, limit, generator, **kwargs)
File "/Library/Python/2.7/site-packages/bs4/element.py", line 537, in _find_all
found = strainer.search(i)
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1654, in search
found = self.search_tag(markup)
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1626, in search_tag
if not self._matches(attr_value, match_against):
File "/Library/Python/2.7/site-packages/bs4/element.py", line 1696, in _matches
if isinstance(markup, Tag):
KeyboardInterrupt
EDIT 2:
Example website for which script doesn't work: miniusa.com
I'm running a Scrapy spider in Python to scrape images from a website. One of the images fails to download (even if I try to download it regularly through the site), which is an internal error on the site's end. That's fine, I don't care about getting that image; I just want to skip over it when it fails and move on to the other images, but I keep getting a 10054 error.
> Traceback (most recent call last):
>   File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
>     current.result = callback(current.result, *args, **kw)
>   File "C:\Python27\Scripts\nhtsa\nhtsa\spiders\NHTSA_spider.py", line 137, in parse_photo_page
>     self.retrievePhoto(base_url_photo + url[0], url_text)
>   File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 49, in wrapped_f
>     return Retrying(*dargs, **dkw).call(f, *args, **kw)
>   File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 212, in call
>     raise attempt.get()
>   File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 247, in get
>     six.reraise(self.value[0], self.value[1], self.value[2])
>   File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 200, in call
>     attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
>   File "C:\Python27\Scripts\nhtsa\nhtsa\spiders\NHTSA_spider.py", line 216, in retrievePhoto
>     code.write(f.read())
>   File "c:\python27\lib\socket.py", line 355, in read
>     data = self._sock.recv(rbufsize)
>   File "c:\python27\lib\httplib.py", line 612, in read
>     s = self.fp.read(amt)
>   File "c:\python27\lib\socket.py", line 384, in read
>     data = self._sock.recv(left)
> error: [Errno 10054] An existing connection was forcibly closed by the remote
Here is my parse function that looks at the photo page and finds the important URLs:
def parse_photo_page(self, response):
    for sel in response.xpath('//table[@id="tblData"]/tr'):
        url = sel.xpath('td/font/a/@href').extract()
        table_fields = sel.xpath('td/font/text()').extract()
        if url:
            base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
            url_text = table_fields[3]
            url_text = string.replace(url_text, " ","")
            url_text = string.replace(url_text," ","")
            self.retrievePhoto(base_url_photo + url[0], url_text)
Here is my download function with retry decorator:
from retrying import retry

@retry(stop_max_attempt_number=5, wait_fixed=2000)
def retrievePhoto(self, url, filename):
    fullPath = self.saveLocation + "/" + filename
    urllib.urlretrieve(url, fullPath)
It retries the download 5 times, but then throws the 10054 error and does not continue to the next image. How can I get the spider to continue after retrying? Again, I don't care about downloading the problem image, I just want to skip over it.
It's correct that you shouldn't use urllib inside scrapy because it blocks everything. Try to read resources related to "scrapy twisted" and "scrapy asynchronous". Anyway... I don't believe that your main problem is with "continue after retrying" but with not using "relevant xpaths" on your expressions. Here is a version that works for me (Note the ./ in './td/font/a/@href'):
import scrapy
import string
import urllib
import os

class MyspiderSpider(scrapy.Spider):
    name = "myspider"
    start_urls = (
        'file:index.html',
    )
    saveLocation = os.getcwd()

    def parse(self, response):
        for sel in response.xpath('//table[@id="tblData"]/tr'):
            url = sel.xpath('./td/font/a/@href').extract()
            table_fields = sel.xpath('./td/font/text()').extract()
            if url:
                base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
                url_text = table_fields[3]
                url_text = string.replace(url_text, " ","")
                url_text = string.replace(url_text," ","")
                self.retrievePhoto(base_url_photo + url[0], url_text)

    from retrying import retry

    @retry(stop_max_attempt_number=5, wait_fixed=2000)
    def retrievePhoto(self, url, filename):
        fullPath = self.saveLocation + "/" + filename
        urllib.urlretrieve(url, fullPath)
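To also cover the "just skip it" part of the question: the retry decorator re-raises the last error once its five attempts are exhausted, so one hedged option is to wrap the call site in parse and log the failure instead of letting it propagate, e.g. inside the if url: block:

try:
    self.retrievePhoto(base_url_photo + url[0], url_text)
except Exception:
    # retries exhausted for this image; note it and move on to the next row
    self.log("Skipping image %s" % (base_url_photo + url[0]))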
And here's a (much better) version that follows your patterns but uses the ImagesPipeline that @paul trmbrth mentioned.
import scrapy
import string
import os

class MyspiderSpider(scrapy.Spider):
    name = "myspider2"
    start_urls = (
        'file:index.html',
    )
    saveLocation = os.getcwd()
    custom_settings = {
        "ITEM_PIPELINES": {'scrapy.pipelines.images.ImagesPipeline': 1},
        "IMAGES_STORE": saveLocation
    }

    def parse(self, response):
        image_urls = []
        image_texts = []
        for sel in response.xpath('//table[@id="tblData"]/tr'):
            url = sel.xpath('./td/font/a/@href').extract()
            table_fields = sel.xpath('./td/font/text()').extract()
            if url:
                base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
                url_text = table_fields[3]
                url_text = string.replace(url_text, " ","")
                url_text = string.replace(url_text," ","")
                image_urls.append(base_url_photo + url[0])
                image_texts.append(url_text)
        return {"image_urls": image_urls, "image_texts": image_texts}
The demo file I use is this:
$ cat index.html
<table id="tblData"><tr>
<td><font>hi foo <span /> <span /> green.jpg </font></td>
</tr><tr>
<td><font>hi foo <span /> <span /> blue.jpg </font></td>
</tr></table>
Pipelines.py
class DotabuffPipeline(object):
    def open_spider(self, spider):
        self.match_dict = {}

    def process_item(self, item, spider):
        ID = item['matchID']
        if ID in self.match_dict:
            self.match_dict[ID] = self.match_dict[ID] + 1
            if self.match_dict[ID]==5:
                return item
        else:
            self.match_dict[ID] = 1
firstspider.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
import json
from dotabuff.items import DotabuffItem

class DotaSpider(CrawlSpider):
    name = "dotaspider"
    allow_domains = ['www.dotabuff.com']
    start_urls = []
    with open('spiders/Steam.json','r') as f:
        steam_data = json.load(f)
        f.close
    steam_members = steam_data['members']
    for member in steam_members:
        url = 'http://www.dotabuff.com/players/%s/matches?page=1' %str(member-76561197960265728)
        start_urls.append(url)

    rules = (Rule(LinkExtractor(allow=(r'http://www.dotabuff.com/players/\d+/matches\?page=\d+')), callback="parse_item", follow= True),)

    def parse_item(self, response):
        sel = Selector(response)
        matches = sel.xpath('//td[@class="cell-large"]/a/@href').extract()
        for match in matches:
            item = DotabuffItem()
            match = match.split('/')[-1]
            item['matchID'] = match
            yield item
I scrape some match numbers from www.dotabuff.com, and I have five Steam IDs in a JSON file. I want to find out the matches the five of us played together, so I define a dict used as a counter to count the number of appearances. But it doesn't work.
Traceback (most recent call last):
  File "e:\anaconda2\lib\site-packages\twisted\internet\defer.py", line 150, in maybeDeferred
    result = f(*args, **kw)
  File "e:\anaconda2\lib\site-packages\scrapy\xlib\pydispatch\robustapply.py", line 57, in robustApply
    return receiver(*arguments, **named)
  File "e:\anaconda2\lib\site-packages\scrapy\extensions\feedexport.py", line 193, in item_scraped
    slot.exporter.export_item(item)
  File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 111, in export_item
    itemdict = dict(self._get_serialized_fields(item))
  File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 63, in _get_serialized_fields
    field_iter = six.iterkeys(item)
  File "e:\anaconda2\lib\site-packages\six.py", line 593, in iterkeys
    return d.iterkeys(**kw)
AttributeError: 'NoneType' object has no attribute 'iterkeys'
Looking at the docs for pipelines in scrapy here, it says
This method is called for every item pipeline component and must
either return a dict with data, Item (or any descendant class) object
or raise a DropItem exception.
Your process_item method doesn't obey this rule and can return None, which is not iterable.
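A hedged sketch of a process_item that follows that rule: it keeps the counting idea from the question, returns the item once it has been seen for all five players, and raises DropItem otherwise so the feed exporter never receives None:

from scrapy.exceptions import DropItem

class DotabuffPipeline(object):
    def open_spider(self, spider):
        self.match_dict = {}

    def process_item(self, item, spider):
        ID = item['matchID']
        # count how many of the five players this match has appeared for
        self.match_dict[ID] = self.match_dict.get(ID, 0) + 1
        if self.match_dict[ID] == 5:
            return item
        # seen fewer than five times so far: drop instead of returning None
        raise DropItem("Match %s seen only %d time(s) so far" % (ID, self.match_dict[ID]))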