My spide looks like this/;
class ScrapeMovies(scrapy.Spider):
start_urls = [
'https://www.trekearth.com/members/page1.htm?sort_by=md'
]
def parse(self, response):
for row in response.xpath('//table[#class="member-table"]//tr[position() > 1]'):
item = loopitem()
website = row.xpath('./td[2]//a/#href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
yield item
# This part is responsible for scraping all of the pages on a start url commented out for convinience
# next_page=response.xpath('//div[#class="page-nav-btm"]/ul/li[last()]/a/#href').extract_first()
# if next_page is not None:
# next_page=response.urljoin(next_page)
# yield scrapy.Request(next_page, callback=self.parse)
What it does as of know it scrapes the table (see the starting url). I want it to then go the link (members name column) and then extract some informations from this link (link is e.g. https://www.trekearth.com/members/monareng/) and the return this as an item.
How should i approach this?
If anything is unclear please do not hesitate to ask for clarification.
EDIT:
nowy my code looks as follows (however still does not work):
class ScrapeMovies(scrapy.Spider):
name='final'
start_urls = [
'https://www.trekearth.com/members/page1.htm?sort_by=md'
]
def parse(self, response):
for row in response.xpath('//table[#class="member-table"]//tr[position() > 1]'):
item = FinalItem()
website = row.xpath('./td[2]//a/#href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
request = scrapy.Request(website,
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
item['groups'] = response.xpath('//div[#class="groups-btm"]/ul/li/text()').extract_first()
return item
Use meta field to put item forward to next callback
def parse_page1(self, response):
item = MyItem(main_url=response.url)
request = scrapy.Request("http://www.example.com/some_page.html",
callback=self.parse_page2)
request.meta['item'] = item
return request
def parse_page2(self, response):
item = response.meta['item']
item['other_url'] = response.url
return item
UPD: to process all rows use a yield in your loop
for row in response.xpath('//table[#class="member-table"]//tr[position() > 1]'):
item = FinalItem()
website = row.xpath('./td[2]//a/#href/text()').extract_first()
item['name'] = row.xpath('./td[2]//a/text()').extract_first()
request = scrapy.Request(website,
callback=self.parse_page2)
request.meta['item'] = item
yield request
Related
I need to parse data from the site. After parsing, data must be saved to disk. I am using scrapy. When working, I need to get data from another page. How can I do that?
class MySpider(scrapy.Spider):
name = "my_spyder"
start_urls = [
'https://www.example.com/title/1',
'https://www.example.com/title/2',
'https://www.example.com/title/3',
]
def parse(self, response):
item = MyItem()
main_page_selector = Selector(response)
...
tagline_url = os.path.join(response.url, 'taglines')
request = Request(url=tagline_url, callback=get_tags)
item['tags'] = yield request
...
yield item
def get_tags(self, response):
tagline_selector = Selector(response)
taglines = []
for tag in tagline_selector.xpath('//div[#class="soda even"))]/text()').getall():
taglines.append(tag.strip())
return taglines
how to write in the 'item' field 'tags' received during the function 'get_tags'?
these requests are executed asynchronously.
request = Request(url=tagline_url, callback=get_tags)
request.meta["item"] = item
yield request
Above code on parse method
item = response.meta["item"]
#...
item["tags"] = taglines
yield item
The second code in the get_tags method
CODE
spider.py
...
def parse(self, response):
for one_item in response.xpath('path1'):
item = ProjectItem()
request = scrapy.Request(one_item.xpath('path2'), callback=self.parse2)
request.meta['item'] = item
yield request
property = []
def parse2(self, response)
item = response.meta['item']
for x in response.xpath('path3')
self.property.append('path4')
next_page = response.xpath('path5')
if next_page is not None:
request2 = scrapy.Request(next_page, callback=self.parse2)
request2.meta['item'] = item
yield request2
else:
item['field'] = self.property
self.property = []
yield item
Problem is that when spider crawl to next_page. Some self.property is assign to wrong items. I don't know how to repair it.
self.property is a class attribute that is shared among all calls to parse2 and you can't control the order of each call to parse2 .
To solve that you need to pass the property list inside the meta or as a item attribute:
def parse(self, response):
for one_item in response.xpath('path1'):
item = ProjectItem()
item['field'] = []
request = scrapy.Request(one_item.xpath('path2'), callback=self.parse2)
request.meta['item'] = item
yield request
def parse2(self, response)
item = response.meta['item']
for x in response.xpath('path3')
item['field'].append('path4')
next_page = response.xpath('path5')
if next_page is not None:
request2 = scrapy.Request(next_page, callback=self.parse2)
request2.meta['item'] = item
yield request2
else:
yield item
I need to make 2 request to different urls and put that information to the same item. I have tried this method, but the result is written in different rows. The callbacks returns item. I have tried many methods but none seems to work.
def parse_companies(self, response):
data = json.loads(response.body)
if data:
item = ThalamusItem()
for company in data:
comp_id = company["id"]
url = self.request_details_URL + str(comp_id) + ".json"
request = Request(url, callback=self.parse_company_details)
request.meta['item'] = item
yield request
url2 = self.request_contacts + str(comp_id)
yield Request(url2, callback=self.parse_company_contacts, meta={'item': item})
Since scrapy is asynchronious you need to chain your requests manually. For transfering data between requests you can use Request's meta attribute:
def parse(self, response):
item = dict()
item['name'] = 'foobar'
yield request('http://someurl.com', self.parse2,
meta={'item': item})
def parse2(self, response):
print(response.meta['item'])
# {'name': 'foobar'}
In your case you end up with a split chain when you should have one continuous chain.
Your code should look something like this:
def parse_companies(self, response):
data = json.loads(response.body)
if not data:
return
for company in data:
item = ThalamusItem()
comp_id = company["id"]
url = self.request_details_URL + str(comp_id) + ".json"
url2 = self.request_contacts + str(comp_id)
request = Request(url, callback=self.parse_details,
meta={'url2': url2, 'item': item})
yield request
def parse_details(self, response):
item = response.meta['item']
url2 = response.meta['url2']
item['details'] = '' # add details
yield Request(url2, callback=self.parse_contacts, meta={'item': item})
def parse_contacts(self, response):
item = response.meta['item']
item['contacts'] = '' # add details
yield item
I am building a simple(ish) parser in Scrapy and I am blissfully ignorant when it comes to scrapy and Python :-) In the file item.py I have a definition of thisItem() which I assign to item in the code below. All worked rather swimmingly, parseusing a callback to get to parse_dir_content... But then I realized I needed to scrape an extra bit of data and created another function parse_other_content. How do I get what is already in item into parse_other_content?
import scrapy
from this-site.items import *
import re
import json
class DmozSpider(scrapy.Spider):
name = "ABB"
allowed_domains = ["this-site.com.au"]
start_urls = [
"https://www.this-site.com.au?page=1",
"https://www.this-site.com.au?page=2",
]
def parse(self, response):
for href in response.xpath('//h3/a/#href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
item = thisItem()
item['title'] = sel.xpath('text()').extract()
item['rate'] = response.xpath('//div[#class="rate"]/div/span/text()').extract()
so = re.search( r'\d+', response.url)
propID = so.group()
item['propid'] = propID
item['link'] = response.url
yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content)
#yield item
def parse_other_content(self, reponse):
sel = json.loads(reponse.body)
item['rate_detail'] = sel["this"][0]["that"]
yield item
I know I am missing something simple here, but I can't seem to figure it out.
Per the scrapy documentation (http://doc.scrapy.org/en/1.0/topics/request-response.html#topics-request-response-ref-request-callback-arguments):
In some cases you may be interested in passing arguments to those callback functions so you can receive the arguments later, in the second callback. You can use the Request.meta attribute for that.
In your case I would do something like this:
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
item = thisItem()
...
request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content)
request.meta['item'] = item
yield request
def parse_other_content(self, response):
item = response.meta['item']
# do something with the item
return item
According to Steve (see comments) you can also pass a dictionary of meta data as a keyword argument to the Request constructor like so:
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
item = thisItem()
...
request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content, meta={'item':item})
yield request
You can either allow item to be visible to parse_other_content() by changing it to self.item, or sending it as a parameter to the function. (The first one might be easier.)
For the first solution just add self. to any reference to the item variable. This makes it visible to the entire class.
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
self.item = thisItem()
self.item['title'] = sel.xpath('text()').extract()
self.item['rate'] = response.xpath('//div[#class="rate"]/div/span/text()').extract()
so = re.search( r'\d+', response.url)
propID = so.group()
self.item['propid'] = propID
self.item['link'] = response.url
yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content)
#yield item
def parse_other_content(self, reponse):
sel = json.loads(reponse.body)
self.item['rate_detail'] = sel["this"][0]["that"]
yield self.item
Suppose I have a Bookitem, I need to add information to it in both the parse phase and detail phase
def parse(self, response)
data = json.loads(response)
for book in data['result']:
item = BookItem();
item['id'] = book['id']
url = book['url']
yield Request(url, callback=self.detail)
def detail(self,response):
hxs = HtmlXPathSelector(response)
item['price'] = ......
#I want to continue the same book item as from the for loop above
Using the code as is would led to undefined item in the detail phase. How can I pass the item to the detail? detail(self,response,item) doesn't seem to work.
There is an argument named meta for Request:
yield Request(url, callback=self.detail, meta={'item': item})
then in function detail, access it this way:
item = response.meta['item']
See more details here about jobs topic.
iMom0's approach still works, but as of scrapy 1.7, the recommended approach is to pass user-defined information through cb_kwargs and leave meta for middlewares, extensions, etc:
def parse(self, response):
....
yield Request(url, callback=self.detail, cb_kwargs={'item': item})
def detail(self,response, item):
item['price'] = ......
You could also pass the individual key-values into the cb_kwargs argument and then only instantiate the BookItem instance in the final callback (detail in this case):
def parse(self, response)
data = json.loads(response)
for book in data['result']:
yield Request(url,
callback=self.detail,
cb_kwargs=dict(id_=book['id'],
url=book['url']))
def detail(self,response, id_, url):
hxs = HtmlXPathSelector(response)
item = BookItem()
item['id'] = id_
item['url'] = url
item['price'] = ......
You can define variable in init method:
class MySpider(BaseSpider):
...
def __init__(self):
self.item = None
def parse(self, response)
data = json.loads(response)
for book in data['result']:
self.item = BookItem();
self.item['id'] = book['id']
url = book['url']
yield Request(url, callback=self.detail)
def detail(self, response):
hxs = HtmlXPathSelector(response)
self.item['price'] = ....