Scrapyd init error when running scrapy spider - python

I'm trying to deploy a crawler with four spiders. One of the spiders uses XMLFeedSpider and runs fine from both the shell and scrapyd, but the others use BaseSpider and all give this error when run in scrapyd, although they run fine from the shell:
TypeError: __init__() got an unexpected keyword argument '_job'
From what I've read, this points to a problem with the __init__ function in my spiders, but I cannot seem to solve the problem. I don't need an __init__ function, and if I remove it completely I still get the error!
My spider looks like this:
from scrapy import log
from scrapy.spider import BaseSpider
from scrapy.selector import XmlXPathSelector

from betfeeds_master.items import Odds

# Parameters
MYGLOBAL = 39

class homeSpider(BaseSpider):
    name = "home"
    #con = None

    allowed_domains = ["www.myhome.com"]
    start_urls = [
        "http://www.myhome.com/oddxml.aspx?lang=en&subscriber=mysubscriber",
    ]

    def parse(self, response):
        items = []
        traceCompetition = ""

        xxs = XmlXPathSelector(response)
        oddsobjects = xxs.select("//OO[OddsType='3W' and Sport='Football']")
        for oddsobject in oddsobjects:
            item = Odds()
            item['competition'] = ''.join(oddsobject.select('Tournament/text()').extract())
            if traceCompetition != item['competition']:
                log.msg('Processing %s' % (item['competition']))  # print item['competition']
                traceCompetition = item['competition']
            item['matchDate'] = ''.join(oddsobject.select('Date/text()').extract())
            item['homeTeam'] = ''.join(oddsobject.select('OddsData/HomeTeam/text()').extract())
            item['awayTeam'] = ''.join(oddsobject.select('OddsData/AwayTeam/text()').extract())
            item['lastUpdated'] = ''
            item['bookie'] = MYGLOBAL
            item['home'] = ''.join(oddsobject.select('OddsData/HomeOdds/text()').extract())
            item['draw'] = ''.join(oddsobject.select('OddsData/DrawOdds/text()').extract())
            item['away'] = ''.join(oddsobject.select('OddsData/AwayOdds/text()').extract())
            items.append(item)

        return items
I can put an __init__ function into the spider, but I get exactly the same error:
    def __init__(self, *args, **kwargs):
        super(homeSpider, self).__init__(*args, **kwargs)
        pass
Why is this happening and how do I solve it?

The correct answer was given by alecx:
My __init__ function was:
def __init__(self, domain_name):
In order to work within an egg for scrapyd, it should be:
def __init__(self, domain_name, **kwargs):
considering you pass domain_name as a mandatory argument.
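For reference, a minimal sketch of that fix applied to the spider above (class and argument names are taken from the question; forwarding the extra keywords to the base class and storing domain_name are assumptions about the rest of the spider):

class homeSpider(BaseSpider):
    name = "home"

    def __init__(self, domain_name, **kwargs):
        # scrapyd passes extra keyword arguments such as _job when it
        # schedules the spider, so accept them and forward them upwards
        super(homeSpider, self).__init__(**kwargs)
        self.domain_name = domain_name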

Related

How To Keep/Export Field Items in Specific Order Per Spider Class Definition, Utilizing The Items Pipeline in Scrapy

I have a spider which exports data to different CSV files (named per the class definitions defined in the spider class). However, I also wanted to keep the fields in a specific order as they were being processed and exported into their different CSV files.
For example, this is my items.py:
import scrapy

class first_class_def_Item(scrapy.Item):
    f1 = scrapy.Field()  # f1 an arbitrary id used for both class definition items
    f2 = scrapy.Field()
    f3 = scrapy.Field()

class second_class_def_Item(scrapy.Item):
    f1 = scrapy.Field()
    f4 = scrapy.Field()
    f5 = scrapy.Field()
    f6 = scrapy.Field()
This is my pipelines.py:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    # The CSV file names are used (imported) from the scrapy spider.
    # For this example, I just want to keep "first_class_def.csv" without
    # the "_Item", as in "first_class_def_Item.csv" as defined in the main scrapy spider.
    return type(item).__name__.replace('_Item', '')

class SomeSitePipeline(object):
    # For simplicity, I'm using the same class def names as found in the
    # main scrapy spider and as defined in the items.py
    SaveTypes = ['first_class_def', 'second_class_def']

    def __init__(self):
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open("/somefolder/" + name + '.csv', 'wb')) for name in self.SaveTypes])
        self.exporters = dict([(name, CsvItemExporter(self.files[name])) for name in self.SaveTypes])
        [e.start_exporting() for e in self.exporters.values()]

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        typesItem = item_type(item)
        if typesItem in set(self.SaveTypes):
            self.exporters[typesItem].export_item(item)
        return item
And this is my spider.py:
import os
import scrapy
from itertools import zip_longest
from somesite.items import first_class_def_Item, second_class_def_Item
from csv import DictReader

path = os.path.join(os.path.expanduser('~'), 'user', 'somefolder', 'IDs.csv')

class SomeSiteSpider(scrapy.Spider):
    name = 'somesite'
    allowed_domains = ['somesite.com']
    start_urls = ['https://somesite.com/login.aspx']

    def parse(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'txtLogin$txtInput': 'User',
                                                          'txtPassword$txtInput': 'pass',
                                                          'btnLogin.x': '53',
                                                          'btnLogin.y': '33'},
                                                callback=self.Tables)

    def Tables(self, response):
        with open(path) as rows:
            for row in DictReader(rows):
                id = row["id"]
                # some_form_control is redacted in the question; it is set elsewhere
                yield scrapy.Request("https://somesite.com/page1.aspx",
                                     meta={'id': id,
                                           'some_form_control': some_form_control},
                                     dont_filter=True,
                                     callback=self.first_class_def)
                yield scrapy.Request("https://somesite.com/page2.aspx",
                                     meta={'id': id,
                                           'some_form_control': some_form_control},
                                     dont_filter=True,
                                     callback=self.second_class_def)

    def first_class_def(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'id': response.meta['id'],
                                                          'form_control': response.meta['some_form_control'],
                                                          'SearchControl$btnCreateReport': 'Create Report'},
                                                meta={'id': response.meta['id']},
                                                callback=self.scrap_page_1)

    def scrap_page_1(self, response):
        items = first_class_def_Item()
        field_1 = response.xpath('//*[@class="formatText"][1]/text()').extract()
        field_2 = response.xpath('//*[@class="formatCurrency"][1]/text()').extract()
        for a, b in zip(field_1, field_2):
            items['f1'] = response.meta['id']
            items['f2'] = a
            items['f3'] = b
            yield items

    def second_class_def(self, response):
        return scrapy.FormRequest.from_response(response,
                                                formdata={'id': response.meta['id'],
                                                          'form_control': response.meta['some_form_control'],
                                                          'form_control_two': 'some_form_control_two',
                                                          'SearchControl$btnCreateReport': 'Create Report'},
                                                meta={'id': response.meta['id']},
                                                callback=self.scrap_page_2)

    def scrap_page_2(self, response):
        items = second_class_def_Item()
        field_1 = response.xpath('//*[@class="formatText"][1]/text()').extract()
        field_2 = response.xpath('//*[@class="formatCurrency"][1]/text()').extract()
        field_3 = response.xpath('//*[@class="formatText"][3]/text()').extract()
        for a, b, c in zip(field_1, field_2, field_3):
            items['f1'] = response.meta['id']
            items['f4'] = a
            items['f5'] = b
            items['f6'] = c
            yield items
As the spider was processing and exporting data, I was looking for a way to keep the fields in the generated CSV files "first_class_def.csv" and "second_class_def.csv" exported in the same order as in items.py:
f1,f2,f3
and
f1,f4,f5,f6
However, whenever I ran the spider, the fields within the CSV files were exported in a random order:
f2,f1,f3 and f5,f1,f4,f6
The solution is posted below!
Unfortunately, due to the way scrapy's Item is implemented, the information about the order of field definitions is not preserved.
If the order matters, the best you can do is define the order you want as a separate class variable, and use that in your pipeline. Passing the fields_to_export argument to CsvItemExporter would probably be simplest.
Here's a basic idea you can play around with:
# items.py
class Item1(scrapy.Item):
    fields_to_export = ['f1', 'f2']
    f1 = scrapy.Field()
    f2 = scrapy.Field()

# pipelines.py
from project.items import Item1

class SomeSitePipeline(object):
    save_types = {'item1': Item1}

    def spider_opened(self, spider):
        # (...)
        self.exporters = dict(
            (name, CsvItemExporter(self.files[name], fields_to_export=item_type.fields_to_export))
            for name, item_type in self.save_types.items()
        )
        # (...)
Also, I just noticed you're using list comprehensions for their side effects, which is a bad idea; you should use a normal loop instead.
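For example, the two comprehensions in the spider_closed handler above could become plain loops; a sketch of that suggestion, with nothing else changed:

def spider_closed(self, spider):
    # plain loops instead of list comprehensions used only for their side effects
    for exporter in self.exporters.values():
        exporter.finish_exporting()
    for csv_file in self.files.values():
        csv_file.close()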
This is the solution to my specific problem: exporting fields ordered per the item class definitions in the items.py of a Scrapy project.
So after tinkering with this problem and implementing @stranac's suggestion of getting rid of the list comprehension, I came up with the following solution, which exports all fields in order into their respective CSV files:
from scrapy.exporters import CsvItemExporter
from scrapy import signals
from pydispatch import dispatcher

def item_type(item):
    # just want "first_class_def.csv", not "first_class_def_Item.csv"
    return type(item).__name__.replace('_Item', '')

class SomeSitePipeline(object):
    fileNamesCsv = ['first_class_def', 'second_class_def']

    def __init__(self):
        self.files = {}
        self.exporters = {}
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

    def spider_opened(self, spider):
        self.files = dict([(name, open("/somefolder/" + name + '.csv', 'wb')) for name in self.fileNamesCsv])
        for name in self.fileNamesCsv:
            self.exporters[name] = CsvItemExporter(self.files[name])
            if name == 'first_class_def':
                self.exporters[name].fields_to_export = ['f1', 'f2', 'f3']
                self.exporters[name].start_exporting()
            if name == 'second_class_def':
                self.exporters[name].fields_to_export = ['f1', 'f4', 'f5', 'f6']
                self.exporters[name].start_exporting()

    def spider_closed(self, spider):
        [e.finish_exporting() for e in self.exporters.values()]
        [f.close() for f in self.files.values()]

    def process_item(self, item, spider):
        typesItem = item_type(item)
        if typesItem in set(self.fileNamesCsv):
            self.exporters[typesItem].export_item(item)
        return item
Now everything works as I originally intended.
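For completeness, a pipeline like this only runs if it is enabled in the project's settings.py; the dotted path below is an assumption about the project layout:

# settings.py -- module path is hypothetical, adjust to your project
ITEM_PIPELINES = {
    'somesite.pipelines.SomeSitePipeline': 300,
}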

text substitution {} does not work at scrapinghub

I create a URL with {} format placeholders to change the URL on the fly.
It works totally fine on my PC.
But once I upload it and run it from Scrapinghub, one of the many substitutions (state) does not work (the others work fine); it returns %7B%7D& in the URL, which is the percent-encoding of curly braces.
Why does this happen? What am I missing when referencing the state variable?
This is the URL from my code:
def __init__(self):
    self.state = 'AL'
    self.zip = '35204'
    self.tax_rate = 0
    self.years = [2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]

def parse_m(self, response):
    r = json.loads(response.text)
    models = r['models']
    year = response.meta['year']
    make = response.meta['make']
    for model in models:
        for milage in [40000, 50000, 60000, 70000, 80000, 90000, 100000]:
            url = '****/vehicles/?year={}&make={}&model={}&state={}&mileage={}&zip={}'.format(
                year, make, model, self.state, milage, self.zip)
and this is the URL I see in the Scrapinghub log:
***/vehicles/?year=2010&make=LOTUS&model=EXIGE%20S&state=%7B%7D&mileage=100000&zip=35204
This is not a Scrapinghub issue; it has to be your code. If I do the following:
>>> "state={}".format({})
'state={}'
This would end up being
state=%7B%7D
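That encoding is easy to reproduce outside of Scrapinghub; a quick check with the standard library (Python 3 shown, on Python 2 the equivalent is urllib.quote):

from urllib.parse import quote

print(quote('{}'))  # prints %7B%7D -- literal braces, percent-encoded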
I would add
assert type(self.state) is str
to my code to ensure this situation doesn't happen, and if it does, you get an AssertionError.

Scrapy Pipeline not starting

I'm having problems with Scrapy pipelines.
EnricherPipeline never starts. I put a debugger in the first line of process_item and it never gets control.
JsonPipeline does start, but the first argument it receives is a generator object (from process_item) and not the MatchItem instance it should receive (when I disable EnricherPipeline, JsonPipeline works as expected).
class MatchSpider(CrawlSpider):

    def parse(self, response):
        browser = Browser(browser='Chrome')
        browser.get(response.url)
        browser.find_element_by_xpath('//a[contains(text(), "{l}") and @title="{c}"]'.format(l=self.league, c=self.country)).click()
        browser.find_element_by_xpath('//select[@id="seasons"]/option[text()="{s}"]'.format(s=self.season.replace('-', '/'))).click()
        browser.find_element_by_xpath('//a[contains(text(), "Fixture")]').click()
        page_matches = browser.find_elements_by_xpath('//*[contains(@class, "result-1 rc")]')
        matches = [m.get_attribute('href') for m in page_matches]
        for m in matches[:1]:
            yield Request(m, callback=self.process_match, dont_filter=True)

    def process_match(self, response):
        match_item = MatchItem()
        match_item['url'] = response.url
        match_item['project'] = self.settings.get('BOT_NAME')
        match_item['spider'] = self.name
        match_item['server'] = socket.gethostname()
        match_item['date'] = datetime.datetime.now()
        return match_item

class EnricherPipeline:
    def process_item(self, item, spider):
        self.match = defaultdict(dict)
        self.match['date'] = item['match']['startTime']
        self.match['referee'] = item['match']['refereeName']
        self.match['stadium'] = item['match']['venueName']
        self.match['exp_mins'] = item['match']['expandedMinutes']
        yield self.match

class JsonPipeline:
    def process_item(self, item, scraper):
        output_dir = 'data/matches/{league}/{season}'.format(league=scraper.league, season=scraper.season)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        file_name = "-".join([str(datetime.strptime(item['date'], '%Y-%m-%dT%H:%M:%S').date()),
                              item['home']['name'], item['away']['name']]) + '.json'
        item_path = os.sep.join((output_dir, file_name))
        with open(item_path, 'w') as f:
            f.write(json.dumps(item))

ITEM_PIPELINES = {
    'scrapers.whoscored.whoscored.pipelines.EnricherPipeline': 300,
    'scrapers.whoscored.whoscored.pipelines.JsonPipeline': 800,
}
OK, so the problem was that EnricherPipeline was yielding rather than returning a result. After that it worked as expected, although I still don't understand why a debugger does not work in that first pipeline.
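A sketch of that fix, keeping the field access exactly as in the question and returning the enriched dict from process_item instead of yielding it:

from collections import defaultdict

class EnricherPipeline:
    def process_item(self, item, spider):
        match = defaultdict(dict)
        match['date'] = item['match']['startTime']
        match['referee'] = item['match']['refereeName']
        match['stadium'] = item['match']['venueName']
        match['exp_mins'] = item['match']['expandedMinutes']
        # return, not yield: a generator body never runs on its own, and the
        # generator object itself is what the next pipeline stage would receive
        return match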
I wrote Scrapy code like @FranGoitia's, returning the item as a plain dict, and it reached the pipeline fine.
The real reason is:
if you yield anything that is not based on dict, the Scrapy engine will not call the pipeline for it.
Oddly, I spent three days finding this...

Scrapy parse_node() takes exactly 3 arguments (2 given)

I'm facing some issues with the parse_node method in Scrapy:
class s1(scrapy.spiders.XMLFeedSpider):
    name = "s1"
    handle_httpstatus_list = [400, 401, 403, 404, 408, 410, 500, 502, 503, 504]
    allowed_domains = ["xxx"]
    start_urls = ["xxx"]
    main_url = start_urls[0]
    jobs_list = []
    tracker = SummaryTracker()
    itertag = "miojob"
    counter = 0

    def parse_node(self, response, node):
        if response.status in [400, 401, 403, 404, 408, 410, 500, 502, 503, 504]:
            time.sleep(60)
            yield scrapy.Request(self.main_url, callback=self.parse_node, errback=self.err1, dont_filter=True)
        else:
            # Some code #
            yield scrapy.Request(self.main_url, callback=self.parse_node, errback=self.err1, dont_filter=True)
This is part of a scrapy bot that recursively scrapes the same page to extract the last ten items. Everything works except for the last scrapy.Request, which gives me this error:
"parse_node() takes exactly 3 arguments (2 given)"
If instead I use a simple Request(self.main_url) it works, but then I can't use the errback, because that requires a callback. I tried passing additional arguments to parse_node like this:
yield scrapy.Request(self.main_url, callback=self.parse_node(arg1, arg2), errback=self.err1, dont_filter=True)
but it gives me an AssertionError, probably because the arguments are wrong?
Do you have any idea how to solve this, i.e. how to pass the correct args to parse_node so that I can also use the errback callable?
try
def parse_node(self, response):
<yourcode>
I've resolved the issue by reading the source code here:
https://github.com/scrapy/scrapy/blob/master/scrapy/spiders/feed.py
The old Request is now:
yield scrapy.Request(self.main_url, callback=self.parse, errback=self.err1, dont_filter=True)
The tweak here is to use the parse method as the callback instead of parse_node, because parse will pass the node Selector on to parse_node.
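Roughly, that source has the following shape (a simplified paraphrase for illustration, not the actual implementation): parse() is the real Request callback, and it is parse() that supplies the extra node argument when it invokes parse_node(response, node), which is why pointing a Request directly at parse_node leaves the third argument missing.

import scrapy

class XMLFeedSpiderSketch(scrapy.Spider):
    itertag = 'miojob'

    def parse(self, response):
        # parse() receives only the response; it builds one selector per
        # <itertag> node and passes each node to parse_node itself
        for node in response.xpath('//%s' % self.itertag):
            for result in self.parse_node(response, node):
                yield result

    def parse_node(self, response, node):
        raise NotImplementedError  # subclasses implement this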

Scrapy creating XML feed wraps content in "value" tags

I've had a bit of help on here and my code pretty much works. The only issue is that, in the process of generating an XML file, it wraps the content in "value" tags when I don't want it to. According to the docs, this is due to this:
Unless overriden in the :meth:serialize_field method, multi-valued
fields are exported by serializing each value inside a <value>
element. This is for convenience, as multi-valued fields are very
common.
This is my output:
<?xml version="1.0" encoding="UTF-8"?>
<items>
<item>
<body>
<value>Don't forget me this weekend!</value>
</body>
<to>
<value>Tove</value>
</to>
<who>
<value>Jani</value>
</who>
<heading>
<value>Reminder</value>
</heading>
</item>
</items>
What I send to the XML exporter seems to be this, so I don't know why it thinks the fields are multi-valued:
{'body': [u"Don't forget me this weekend!"],
'heading': [u'Reminder'],
'to': [u'Tove'],
'who': [u'Jani']}
pipeline.py
from scrapy import signals
from scrapy.contrib.exporter import XmlItemExporter

class XmlExportPipeline(object):

    def __init__(self):
        self.files = {}

    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
        crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
        return pipeline

    def spider_opened(self, spider):
        file = open('%s_products.xml' % spider.name, 'w+b')
        self.files[spider] = file
        self.exporter = XmlItemExporter(file)
        self.exporter.start_exporting()

    def spider_closed(self, spider):
        self.exporter.finish_exporting()
        file = self.files.pop(spider)
        file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
spider.py
from scrapy.contrib.spiders import XMLFeedSpider
from crawler.items import CrawlerItem

class SiteSpider(XMLFeedSpider):
    name = 'site'
    allowed_domains = ['www.w3schools.com']
    start_urls = ['http://www.w3schools.com/xml/note.xml']
    itertag = 'note'

    def parse_node(self, response, selector):
        item = CrawlerItem()
        item['to'] = selector.xpath('//to/text()').extract()
        item['who'] = selector.xpath('//from/text()').extract()
        item['heading'] = selector.xpath('//heading/text()').extract()
        item['body'] = selector.xpath('//body/text()').extract()
        return item
Any help would be really appreciated. I just want the same output without the redundant tags.
The extract() method always returns a list of values, even if there is only a single result, for example [4] or [3, 4, 5]; if nothing matches, it returns an empty list.
To avoid this, if you know there is only one value, you can select it like:
item['to'] = selector.xpath('//to/text()').extract()[0]
Note:
Be aware that this can throw an exception if extract() returns an empty list and you try to index into it. In such uncertain cases, this is a good trick to use:
item['to'] = (selector.xpath('...').extract() or [''])[0]
Or you could write your custom function to get the first element:
def extract_first(selector, default=None):
    val = selector.extract()
    return val[0] if val else default
This way you can have a default value in case your desired value is not found:
item['to'] = extract_first(selector.xpath(...))               # first or None
item['to'] = extract_first(selector.xpath(...), 'not-found')  # first or 'not-found'
The above answer is correct about why this happens, but I'd like to add that there is now out-of-the-box support for this, so there is no need to write a helper method:
item['to'] = selector.xpath('//to/text()').extract_first()
and
item['to'] = selector.xpath('//to/text()').extract_first(default='spam')
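For what it's worth, more recent Scrapy/parsel releases also expose the same behaviour under shorter names; assuming a current version, the equivalents would be:

item['to'] = selector.xpath('//to/text()').get()                # first match, or None
item['to'] = selector.xpath('//to/text()').get(default='spam')  # first match, or 'spam'
item['to_all'] = selector.xpath('//to/text()').getall()         # list of all matches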
