How to implement a counter in a Scrapy pipeline? - python

Pipelines.py
class DotabuffPipeline(object):
    def open_spider(self, spider):
        self.match_dict = {}

    def process_item(self, item, spider):
        ID = item['matchID']
        if ID in self.match_dict:
            self.match_dict[ID] = self.match_dict[ID] + 1
            if self.match_dict[ID] == 5:
                return item
        else:
            self.match_dict[ID] = 1
firstspider.py
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
import json
from dotabuff.items import DotabuffItem

class DotaSpider(CrawlSpider):
    name = "dotaspider"
    allow_domains = ['www.dotabuff.com']
    start_urls = []
    with open('spiders/Steam.json', 'r') as f:
        steam_data = json.load(f)
    f.close
    steam_members = steam_data['members']
    for member in steam_members:
        url = 'http://www.dotabuff.com/players/%s/matches?page=1' % str(member - 76561197960265728)
        start_urls.append(url)

    rules = (Rule(LinkExtractor(allow=(r'http://www.dotabuff.com/players/\d+/matches\?page=\d+')), callback="parse_item", follow=True),)

    def parse_item(self, response):
        sel = Selector(response)
        matches = sel.xpath('//td[@class="cell-large"]/a/@href').extract()
        for match in matches:
            item = DotabuffItem()
            match = match.split('/')[-1]
            item['matchID'] = match
            yield item
I scrape some match numbers from www.dotabuff.com, and I have five Steam IDs in a JSON file. I want to find the matches the five of us played together, so I define a dict used as a counter to count the number of appearances. But it doesn't work.
Traceback (most recent call last):
  File "e:\anaconda2\lib\site-packages\twisted\internet\defer.py", line 150, in maybeDeferred
    result = f(*args, **kw)
  File "e:\anaconda2\lib\site-packages\scrapy\xlib\pydispatch\robustapply.py", line 57, in robustApply
    return receiver(*arguments, **named)
  File "e:\anaconda2\lib\site-packages\scrapy\extensions\feedexport.py", line 193, in item_scraped
    slot.exporter.export_item(item)
  File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 111, in export_item
    itemdict = dict(self._get_serialized_fields(item))
  File "e:\anaconda2\lib\site-packages\scrapy\exporters.py", line 63, in _get_serialized_fields
    field_iter = six.iterkeys(item)
  File "e:\anaconda2\lib\site-packages\six.py", line 593, in iterkeys
    return d.iterkeys(**kw)
AttributeError: 'NoneType' object has no attribute 'iterkeys'

Looking at the docs for pipelines in scrapy here, it says
This method is called for every item pipeline component and must
either return a dict with data, Item (or any descendant class) object
or raise a DropItem exception.
Your process_item method doesn't obey this rule and can return None, which is not iterable.
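For example, here is a minimal sketch that keeps the counting logic from the question but drops the item explicitly instead of falling through and returning None (untested; the DropItem message is mine, not from the question):

from scrapy.exceptions import DropItem

class DotabuffPipeline(object):
    def open_spider(self, spider):
        self.match_dict = {}

    def process_item(self, item, spider):
        ID = item['matchID']
        # Count how many times this match ID has been seen so far.
        self.match_dict[ID] = self.match_dict.get(ID, 0) + 1
        if self.match_dict[ID] == 5:
            # All five players appear in this match, so keep the item.
            return item
        # Never return None: drop the item until it has been seen five times.
        raise DropItem('match %s seen fewer than five times' % ID)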

Related

Scrapy CSV column export

I'd like to export data to several columns in CSV, but I always obtain this kind of file: (screenshot of the resulting CSV omitted)
I'd like to obtain two columns, one "articulo" and another one "price".
My pipelines:
import scrapy
from scrapy import signals
from scrapy.contrib.exporter import CsvItemExporter
import csv

class MercadoPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_items.csv' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = CsvItemExporter(file)
        self.exporter.fields_to_export = ['articulo', 'precio']
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
Can you help me please?
Here you are:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.exceptions import CloseSpider
from mercado.items import MercadoItem

class MercadoSpider(CrawlSpider):
    name = 'mercado'
    item_count = 0
    allowed_domain = ['www.autodoc.es']
    start_urls = ['https://www.autodoc.es/search?brandNo%5B0%5D=101']

    rules = {
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//span[@class="next"]/a'))),
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="ga-click"]')),
             callback='parse_item', follow=False)
    }

    def parse_item(self, response):
        ml_item = MercadoItem()
        # product info
        ml_item['articulo'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[2]/div[1]/div[1]/div/span[1]/span/text())').extract()
        ml_item['precio'] = response.xpath('normalize-space(//*[@id="content"]/div[4]/div[3]/div[2]/p[2]/text())').extract()
        self.item_count += 1
        if self.item_count > 20:
            raise CloseSpider('item_exceeded')
        yield ml_item
There is nothing wrong with the output of your code.
You are getting the two csv columns you want, but the program you are using to view the data is not interpreting it correctly.
By default, CsvItemExporter uses a comma (,) as the delimiter, and the program seems to expect something else (and possibly even different quoting).
There are two possibilities to solve your problem:
Change the program's settings so it reads the file correctly.
Change the way CsvItemExporter exports data (it will pass any additional keyword arguments to the underlying csv.writer object); a sketch of this is shown below.
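As a rough sketch of the second option, assuming for illustration that the viewing program expects a semicolon as the delimiter (the subclass name SemicolonCsvItemExporter and the delimiter value are mine, not from the question), you could subclass CsvItemExporter and let the extra keyword argument flow through to csv.writer:

from scrapy.contrib.exporter import CsvItemExporter

class SemicolonCsvItemExporter(CsvItemExporter):
    def __init__(self, file, **kwargs):
        # CsvItemExporter forwards unknown keyword arguments to csv.writer,
        # so this changes the delimiter of the generated file.
        kwargs['delimiter'] = ';'
        super(SemicolonCsvItemExporter, self).__init__(file, **kwargs)

You would then instantiate it in spider_opened instead of CsvItemExporter, e.g. self.exporter = SemicolonCsvItemExporter(file).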

scrapy pipeline exporter object is not getting instantiated

I am using Scrapy to parse a table containing links and save it as JSON. The links from the table contain additional detail, which will be fetched and stored in another JSON file (following this example: https://docs.scrapy.org/en/latest/topics/exporters.html).
To achieve this I am using a pipeline to check the item type and store the result in the appropriate JSON file. However, I am stuck on a weird error. Please refer below:
from scrapy import signals
from scrapy.exporters import JsonItemExporter
from for_icu import items

class ListPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        print("spider_opened")
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        print("spider_closed")
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
            return item
        elif isinstance(item, items.UniDetail):
            self.exporter22.export_item(item)
            return item
Error:
2017-12-27 11:41:15 [scrapy.core.scraper] ERROR: Error processing {'country': ('Finland',),
'country_code': ('fi ',),
'u_link': ('http://www.xxxxxxx.xxx/xxxxxxx/xxxx.htm',),
'u': (' pisto',)}
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/Users/website_scrapy/for_icu/for_icu/pipelines.py", line 31, in process_item
self.exporter.export_item(item)
AttributeError: 'ListPipeline' object has no attribute 'exporter1'
Please let me know what I am missing here... I have been stuck on this for the past couple of hours.
I was unable to get the exporter to work, so I used a simple file writer for the task:
# json and the project's items module are used below
import json

from for_icu import items

class ListPipeline(object):
    unilist = []
    unidetail = []

    def close_spider(self, spider):
        print("spider_closed")
        file_table = open('%s_table.json' % spider.name, 'w')
        line = json.dumps(self.unilist)
        file_table.write(line)
        file_table.close()
        file_detail = open('%s_detail.json' % spider.name, 'w')
        line = json.dumps(self.unidetail)
        file_detail.write(line)
        file_detail.close()
        self.unilist.clear()
        self.unidetail.clear()

    def process_item(self, item, spider):
        print("process_item")
        if isinstance(item, items.UniListItem):
            self.unilist.append(dict(item))
            return item
        elif isinstance(item, items.UniDetail):
            self.unidetail.append(dict(item))
            return item
This achieves what I want, but it would be better to use the built-in exporters. If someone knows how to make that work, please update.
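For what it is worth, here is a sketch of how the built-in JsonItemExporter might still be used, assuming the problems in the original pipeline were the uninitialised self.files[spider] list and the self.exporter22 typo (this is untested against the original project):

from scrapy import signals
from scrapy.exporters import JsonItemExporter
from for_icu import items

class ListPipeline(object):
    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        # Initialise the per-spider file list before appending to it.
        self.files[spider] = []
        file_table = open('%s_table.json' % spider.name, 'w+b')
        self.files[spider].append(file_table)
        self.exporter1 = JsonItemExporter(file_table)
        self.exporter1.start_exporting()
        file_detail = open('%s_detail.json' % spider.name, 'w+b')
        self.files[spider].append(file_detail)
        self.exporter2 = JsonItemExporter(file_detail)
        self.exporter2.start_exporting()

    def spider_closed(self, spider):
        self.exporter1.finish_exporting()
        self.exporter2.finish_exporting()
        for file in self.files.pop(spider):
            file.close()

    def process_item(self, item, spider):
        if isinstance(item, items.UniListItem):
            self.exporter1.export_item(item)
        elif isinstance(item, items.UniDetail):
            # exporter2, not exporter22
            self.exporter2.export_item(item)
        return item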

Scrapy: Error 10054 after retrying image download

I'm running a Scrapy spider in Python to scrape images from a website. One of the images fails to download (even if I try to download it regularly through the site), which is an internal error for the site. This is fine; I don't care about trying to get that image. I just want to skip over the image when it fails and move on to the other images, but I keep getting a 10054 error.
Traceback (most recent call last):
  File "c:\python27\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "C:\Python27\Scripts\nhtsa\nhtsa\spiders\NHTSA_spider.py", line 137, in parse_photo_page
    self.retrievePhoto(base_url_photo + url[0], url_text)
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 49, in wrapped_f
    return Retrying(*dargs, **dkw).call(f, *args, **kw)
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 212, in call
    raise attempt.get()
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 247, in get
    six.reraise(self.value[0], self.value[1], self.value[2])
  File "C:\Python27\Scripts\nhtsa\nhtsa\retrying.py", line 200, in call
    attempt = Attempt(fn(*args, **kwargs), attempt_number, False)
  File "C:\Python27\Scripts\nhtsa\nhtsa\spiders\NHTSA_spider.py", line 216, in retrievePhoto
    code.write(f.read())
  File "c:\python27\lib\socket.py", line 355, in read
    data = self._sock.recv(rbufsize)
  File "c:\python27\lib\httplib.py", line 612, in read
    s = self.fp.read(amt)
  File "c:\python27\lib\socket.py", line 384, in read
    data = self._sock.recv(left)
error: [Errno 10054] An existing connection was forcibly closed by the remote host
Here is my parse function that looks at the photo page and finds the important URLs:
def parse_photo_page(self, response):
    for sel in response.xpath('//table[@id="tblData"]/tr'):
        url = sel.xpath('td/font/a/@href').extract()
        table_fields = sel.xpath('td/font/text()').extract()
        if url:
            base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
            url_text = table_fields[3]
            url_text = string.replace(url_text, "&nbsp", "")
            url_text = string.replace(url_text, " ", "")
            self.retrievePhoto(base_url_photo + url[0], url_text)
Here is my download function with retry decorator:
from retrying import retry

@retry(stop_max_attempt_number=5, wait_fixed=2000)
def retrievePhoto(self, url, filename):
    fullPath = self.saveLocation + "/" + filename
    urllib.urlretrieve(url, fullPath)
It retries the download 5 times, but then throws the 10054 error and does not continue to the next image. How can I get the spider to continue after retrying? Again, I don't care about downloading the problem image, I just want to skip over it.
It's correct that you shouldn't use urllib inside Scrapy because it blocks everything. Try to read resources related to "scrapy twisted" and "scrapy asynchronous". Anyway... I don't believe that your main problem is with "continue after retrying" but with not using relative XPath expressions. Here is a version that works for me (note the ./ in './td/font/a/@href'):
import scrapy
import string
import urllib
import os
from retrying import retry

class MyspiderSpider(scrapy.Spider):
    name = "myspider"
    start_urls = (
        'file:index.html',
    )
    saveLocation = os.getcwd()

    def parse(self, response):
        for sel in response.xpath('//table[@id="tblData"]/tr'):
            url = sel.xpath('./td/font/a/@href').extract()
            table_fields = sel.xpath('./td/font/text()').extract()
            if url:
                base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
                url_text = table_fields[3]
                url_text = string.replace(url_text, "&nbsp", "")
                url_text = string.replace(url_text, " ", "")
                self.retrievePhoto(base_url_photo + url[0], url_text)

    @retry(stop_max_attempt_number=5, wait_fixed=2000)
    def retrievePhoto(self, url, filename):
        fullPath = self.saveLocation + "/" + filename
        urllib.urlretrieve(url, fullPath)
And here's a (much better) version that follows your patterns but uses the ImagesPipeline that @paul trmbrth mentioned.
import scrapy
import string
import os

class MyspiderSpider(scrapy.Spider):
    name = "myspider2"
    start_urls = (
        'file:index.html',
    )
    saveLocation = os.getcwd()
    custom_settings = {
        "ITEM_PIPELINES": {'scrapy.pipelines.images.ImagesPipeline': 1},
        "IMAGES_STORE": saveLocation
    }

    def parse(self, response):
        image_urls = []
        image_texts = []
        for sel in response.xpath('//table[@id="tblData"]/tr'):
            url = sel.xpath('./td/font/a/@href').extract()
            table_fields = sel.xpath('./td/font/text()').extract()
            if url:
                base_url_photo = "http://www-nrd.nhtsa.dot.gov/"
                url_text = table_fields[3]
                url_text = string.replace(url_text, "&nbsp", "")
                url_text = string.replace(url_text, " ", "")
                image_urls.append(base_url_photo + url[0])
                image_texts.append(url_text)
        return {"image_urls": image_urls, "image_texts": image_texts}
The demo file I use is this:
$ cat index.html
<table id="tblData"><tr>
<td><font>hi foo <span /> <span /> green.jpg </font></td>
</tr><tr>
<td><font>hi foo <span /> <span /> blue.jpg </font></td>
</tr></table>

Cannot download image with relative URL Python Scrapy

I'm using Scrapy to download images from http://www.vesselfinder.com/vessels
However, I can only get relative URLs of the images, like http://www.vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0
All of the images are named 0.jpg, but if I try to use that as an absolute URL, I cannot get access to the image.
My code:
items.py
import scrapy

class VesselItem(scrapy.Item):
    name = scrapy.Field()
    nationality = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.contrib.pipeline.images import ImagesPipeline
from scrapy.exceptions import DropItem

class VesselPipeline(object):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
vessel_spider.py
import scrapy
import string
from vessel.items import VesselItem

class VesselSpider(scrapy.Spider):
    """docstring for VesselSpider"""
    name = "vessel"
    allowed_domains = ["vesselfinder.com"]
    page_name = "http://vesselfinder.com"
    start_urls = [
        # "http://vesselfinder.com/vessels?page=%d" % i for i in range(0, 1000)
        "http://vesselfinder.com/vessels"
    ]

    def parse(self, response):
        f = open('vessels.txt', 'a')
        count = 0
        for sel in response.xpath('//div[@class="items"]/article'):
            item = VesselItem()
            imageStr = str(sel.xpath('div[1]/a/picture/img/@src').extract())
            item['image_urls'] = self.page_name + imageStr[3:-2]
            nameStr = str(sel.xpath('div[2]/header/h1/a/text()').extract())
            item['name'] = nameStr[19:-8]
            typeStr = str(sel.xpath('div[2]/div[2]/div[2]/text()').extract())
            item['type'] = typeStr[3:-2]
            return item
When I run this spider, I get an exceptions.ValueError: Missing scheme in request url: h error, because I did not provide an absolute URL.
[vessel] ERROR: Error processing {'image_urls': 'http://vesselfinder.com/vessels/ship-photo/0-224138470-a2fdc783d05a019d00ad9db0cef322f7/0.jpg',
'name': 'XILGARO ALEANTE',
'type': 'Sailing vessel'}
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/scrapy/middleware.py", line 62, in _process_chain
return process_chain(self.methods[methodname], obj, *args)
File "/usr/local/lib/python2.7/dist-packages/scrapy/utils/defer.py", line 65, in process_chain
d.callback(input)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 383, in callback
self._startRunCallbacks(result)
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 491, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/local/lib/python2.7/dist-packages/twisted/internet/defer.py", line 578, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/media.py", line 40, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/pipeline/images.py", line 104, in get_media_requests
return [Request(x) for x in item.get(self.IMAGES_URLS_FIELD, [])]
File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 26, in __init__
self._set_url(url)
File "/usr/local/lib/python2.7/dist-packages/scrapy/http/request/__init__.py", line 61, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
exceptions.ValueError: Missing scheme in request url: h
How should I fix this? Is there any special way of getting the image (or its absolute URL) with a site like this one?
Wrap your image url in a list like so:
item['image_urls'] = [self.page_name + imageStr[3:-2]]
I think the following code will do the trick (very few changes to your code),
vessel_spider.py
class VesselSpider(scrapy.Spider):
    """docstring for VesselSpider"""
    name = "vessel"
    allowed_domains = ["vesselfinder.com"]
    page_name = "http://vesselfinder.com"
    start_urls = [
        # "http://vesselfinder.com/vessels?page=%d" % i for i in range(0, 1000)
        "http://vesselfinder.com/vessels"
    ]

    def parse(self, response):
        f = open('vessels.txt', 'a')
        count = 0
        for sel in response.xpath('//div[@class="items"]/article'):
            item = VesselItem()
            imageStr = sel.xpath('./div[@class="small-12 medium-5 large-5 columns"]/a/picture/img/@src').extract()
            imageStr = imageStr[0] if imageStr else 'N/A'
            item['image_urls'] = [self.page_name + imageStr]
            nameStr = sel.xpath('./div/header/h1[@class="subheader"]/a/text()').extract()
            nameStr = ' '.join(' '.join(nameStr).split()) if nameStr else 'N/A'
            item['name'] = nameStr
            typeStr = sel.xpath('.//div[@class="small-4 columns" and contains(text(), "Ship type")]/following-sibling::div/text()').extract()
            typeStr = typeStr[0].strip() if typeStr else 'N/A'
            item['ship_type'] = typeStr
            yield item
items.py
class VesselItem(scrapy.Item):
    name = scrapy.Field()
    nationality = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
    ship_type = scrapy.Field()
Appending the sample output,
{'image_urls': u'http://vesselfinder.com/vessels/ship-photo/0-227349190-7c01e2b3a7a5078ea94fff9a0f862f8a/0',
'name': u'IBTISAM ATAO',
'ship_type': u'Sailing vessel'}

Podio file upload fails in Python

I'm trying to do an upload to podio but it fails.
The following is an extract:

c = api.OAuthClient(
    podio_pw.client_id,
    podio_pw.client_secret,
    podio_pw.username,
    podio_pw.password,
)

source = "dit is een test"
attributes = {
    'filename': 'test.txt',
    'source': source
}

filep = c.transport.POST(url='/file/v2/', body=attributes, type='multipart/form-data')
This always results in the following error:
Traceback (most recent call last):
  File "C:\Python34\libs\podio-py-master\attach_invoices.py", line 43, in <module>
    filep = c.transport.POST(url='/file/v2/', body=attributes, type='multipart/form-data')
  File "C:\Python34\libs\podio-py-master\pypodio2\transport.py", line 135, in __call__
    body = "".join(body)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 376, in __next__
    return next(self)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 352, in __next__
    block = next(self.param_iter)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 245, in iter_encode
    block = self.encode(boundary)
  File "C:\Python34\libs\podio-py-master\pypodio2\encode.py", line 233, in encode
    if re.search("^--%s$" % re.escape(boundary), value, re.M):
  File "C:\Python34\lib\re.py", line 166, in search
    return _compile(pattern, flags).search(string)
TypeError: can't use a string pattern on a bytes-like object
I know it has something to do with byte encoding, etc., but I have no idea how to handle it. Even if I try to make that source a file, raw data or whatever, the POST fails.
This worked for me:
c = api.OAuthClient(
    client_id,
    client_secret,
    username,
    password,
)

filename = 'screenie.png'
filedata = open(filename, 'r')

"""Create a file from raw data"""
attributes = {'filename': filename,
              'source': filedata}

file_upload = c.transport.POST(url='/file/v2/', body=attributes, type='multipart/form-data')
print(file_upload)
I lifted the code from here: https://github.com/podio/podio-py/blob/master/pypodio2/areas.py
To execute the file upload process in Python 3.*, you have to update two files in pypodio.
Step 1
Replace the file encode.py with the below script.
import urllib.request
import http.client
import mimetypes
import codecs
import uuid
import binascii
import io
import os
import sys


def multipart_encode(fields, files):
    content_type, body = MultipartFormdataEncoder().encode(fields, files)
    return body, content_type


class MultipartFormdataEncoder(object):
    def __init__(self):
        self.boundary = uuid.uuid4().hex
        self.content_type = 'multipart/form-data; boundary={}'.format(self.boundary)

    @classmethod
    def u(cls, s):
        if sys.hexversion < 0x03000000 and isinstance(s, str):
            s = s.decode('utf-8')
        if sys.hexversion >= 0x03000000 and isinstance(s, bytes):
            s = s.decode('utf-8')
        return s

    def iter(self, fields, files):
        """
        fields is a sequence of (name, value) elements for regular form fields.
        files is a sequence of (name, filename, file-type) elements for data to be uploaded as files.
        Yield body's chunk as bytes.
        """
        encoder = codecs.getencoder('utf-8')
        for (key, value) in fields:
            key = self.u(key)
            yield encoder('--{}\r\n'.format(self.boundary))
            yield encoder(self.u('Content-Disposition: form-data; name="{}"\r\n').format(key))
            yield encoder('\r\n')
            if isinstance(value, int) or isinstance(value, float):
                value = str(value)
            yield encoder(self.u(value))
            yield encoder('\r\n')
        for (key, filename, fpath) in files:
            key = self.u(key)
            filename = self.u(filename)
            yield encoder('--{}\r\n'.format(self.boundary))
            yield encoder(self.u('Content-Disposition: form-data; name="{}"; filename="{}"\r\n').format(key, filename))
            yield encoder('Content-Type: {}\r\n'.format(mimetypes.guess_type(filename)[0] or 'application/octet-stream'))
            yield encoder('\r\n')
            with open(fpath, 'rb') as fd:
                buff = fd.read()
                yield (buff, len(buff))
            yield encoder('\r\n')
        yield encoder('--{}--\r\n'.format(self.boundary))

    def encode(self, fields, files):
        body = io.BytesIO()
        for chunk, chunk_len in self.iter(fields, files):
            body.write(chunk)
        return self.content_type, body.getvalue()
Code snippet from here
Step 2
Update transport.py at line no. 186:

if kwargs['type'] == 'multipart/form-data':
    fields = [('filename', kwargs['body']['filename'])]
    files = [('source', kwargs['body']['filename'], kwargs['body']['source'])]
    body, content_type = multipart_encode(fields, files)
    headers.update({'Content-Type': content_type, })
else:
Returns "can only join an iterable error"
attributes={'filename': 'mx.txt', 'source': 'hello uyur92wyhfr ruptgpwyoer8t9u'}
try:
item = c.transport.POST(url=url,
body=attributes,
type='multipart/form-data')
except Exception as e:
print(e)
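One possible reason, judging from the patched transport.py above (this is an assumption on my part, not confirmed in the thread): after the patch, 'source' is treated as a file path and opened with open(fpath, 'rb'), so raw string data would have to be written to a file first, for example:

import os
import tempfile

data = 'hello uyur92wyhfr ruptgpwyoer8t9u'

# Write the raw string to a temporary file so 'source' can be a real path.
tmp = tempfile.NamedTemporaryFile(suffix='.txt', delete=False)
tmp.write(data.encode('utf-8'))
tmp.close()

attributes = {'filename': 'mx.txt', 'source': tmp.name}
item = c.transport.POST(url=url, body=attributes, type='multipart/form-data')
os.unlink(tmp.name)

Here c and url are the client and endpoint from the snippet above.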
