I need to parse data from a site and save it to disk after parsing. I am using Scrapy. While parsing, I need to get data from another page. How can I do that?
class MySpider(scrapy.Spider):
    name = "my_spyder"
    start_urls = [
        'https://www.example.com/title/1',
        'https://www.example.com/title/2',
        'https://www.example.com/title/3',
    ]

    def parse(self, response):
        item = MyItem()
        main_page_selector = Selector(response)
        ...
        tagline_url = os.path.join(response.url, 'taglines')
        request = Request(url=tagline_url, callback=self.get_tags)
        item['tags'] = yield request
        ...
        yield item
    def get_tags(self, response):
        tagline_selector = Selector(response)
        taglines = []
        for tag in tagline_selector.xpath('//div[@class="soda even"]/text()').getall():
            taglines.append(tag.strip())
        return taglines
How do I write the taglines returned by get_tags into the item's 'tags' field?
These requests are executed asynchronously, so you can't get the result of a request inline like that; pass the item along with the request instead:
request = Request(url=tagline_url, callback=get_tags)
request.meta["item"] = item
yield request
The code above goes in the parse method, replacing the item['tags'] = yield request line.
item = response.meta["item"]
#...
item["tags"] = taglines
yield item
The second snippet goes in the get_tags method.
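Putting the two snippets together, a minimal sketch of the whole spider might look like this (MyItem and the taglines XPath are taken from the question; the URL handling mirrors the question's os.path.join):

import scrapy
from scrapy import Request

class MySpider(scrapy.Spider):
    name = "my_spyder"
    start_urls = ['https://www.example.com/title/1']

    def parse(self, response):
        item = MyItem()  # the item class from the question; assumed to define a 'tags' field
        # ... fill the other fields from the main page here ...
        tagline_url = response.url + '/taglines'  # same URL the question builds with os.path.join
        request = Request(url=tagline_url, callback=self.get_tags)
        request.meta["item"] = item  # carry the partially filled item along with the request
        yield request                # do NOT yield item here; get_tags yields it once complete

    def get_tags(self, response):
        item = response.meta["item"]
        # XPath taken from the question; adjust to the real page structure
        taglines = [t.strip() for t in response.xpath('//div[@class="soda even"]/text()').getall()]
        item["tags"] = taglines
        yield item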
def parse(self, response):
    category_names = []
    category_urls = []
    for item in response.css("#zg_browseRoot ul li"):
        category_url = item.css("a").css(self.CSS_URL).extract()
        category_name = item.css("a").css(self.CSS_TEXT).extract()
        category_url = [
            self.parse_url(category_url, 4) for category_url in category_url
        ]
        (category_url,) = category_url
        (category_name,) = category_name
        category_names.append(category_name)
        category_urls.append(category_url)

    for c_name, url in zip(category_names, category_urls):
        self.c_name = [c_name]
        yield scrapy.Request(url, callback=self.parse_categories)

def parse_url(self, url, number):
    parse = urlparse(url)
    split = parse.path.split("/")[:number]
    return f'{self.BASE_URL}{"/".join(split)}'

def parse_categories(self, response):
    sub_names = []
    sub_urls = []
    for item in response.css("#zg_browseRoot ul ul li"):
        sub_name = item.css("a").css(self.CSS_TEXT).extract()
        sub_url = item.css("a").css(self.CSS_URL).extract()
        sub_url = [self.parse_url(sub_url, 5) for sub_url in sub_url]
        (sub_url,) = sub_url
        (sub_name,) = sub_name
        sub_names.append(sub_name)
        sub_urls.append(sub_url)

    for sub_name, url in zip(sub_names, sub_urls):
        self.sub_name = [sub_name]
        # print("{}: {}, {}".format(url, self.sub_name, self.c_name))
        yield scrapy.Request(url, callback=self.parse_subcategories)

def parse_subcategories(self, response):
    url = self.parse_url(response.request.url, 5)
    print(f"{self.c_name}, {self.sub_name}, {url}")
Hello everyone,
I'm having an issue with my Scrapy approach. I'm trying to scrape a page that has categories and subcategories, each containing items, and I want to include the category and subcategory with each item scraped.
The problem is that Scrapy's callbacks run asynchronously, so zipping the URLs with the names doesn't work: the for loop is processed first and all the requests are scheduled before any callback runs, so by the time a callback fires the instance attribute only holds the last name. Can anyone help me work around this?
Thanks in advance,
Daniel.
You can pass arbitrary data along with the requests by using the cb_kwargs parameter. You can read about the details here.
Here is a simplified example:
def parse(self, response):
    rows = response.xpath('//div[@id="some-element"]')
    for row in rows:
        request_url = row.xpath('a/@href').get()
        category = row.xpath('a/text()').get()
        yield Request(
            url=request_url,
            callback=self.parse_category,
            cb_kwargs={'category': category}
        )

def parse_category(self, response, category):  # Notice the category arg in the func
    # Process the response here, using category as needed
    yield item
The data inserted in cb_kwargs is passed as a keyword argument into the callback function, so the key in the dict must match the name of the argument in the method definition.
cb_kwargs was introduced in Scrapy v1.7; if you are using an older version you should use the meta param instead. You can read about it here; notice that the usage is slightly different.
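For reference, a minimal sketch of the same flow using meta instead (same assumed page structure as the example above):

def parse(self, response):
    rows = response.xpath('//div[@id="some-element"]')
    for row in rows:
        request_url = row.xpath('a/@href').get()
        category = row.xpath('a/text()').get()
        # meta travels with the request and is exposed on the response
        yield Request(
            url=request_url,
            callback=self.parse_category,
            meta={'category': category}
        )

def parse_category(self, response):
    # with meta there is no extra argument; read the value from response.meta
    category = response.meta['category']
    # Process the response here, using category as needed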
My spider looks like this:
class ScrapeMovies(scrapy.Spider):
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = loopitem()
            website = row.xpath('./td[2]//a/@href').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            yield item

        # This part is responsible for scraping all of the pages on a start url, commented out for convenience
        # next_page = response.xpath('//div[@class="page-nav-btm"]/ul/li[last()]/a/@href').extract_first()
        # if next_page is not None:
        #     next_page = response.urljoin(next_page)
        #     yield scrapy.Request(next_page, callback=self.parse)
What it does as of now is scrape the table (see the starting URL). I want it to then go to the link in the member's name column, extract some information from that page (the link is e.g. https://www.trekearth.com/members/monareng/), and then return all of this as one item.
How should I approach this?
If anything is unclear please do not hesitate to ask for clarification.
EDIT:
Now my code looks as follows (however it still does not work):
class ScrapeMovies(scrapy.Spider):
    name = 'final'
    start_urls = [
        'https://www.trekearth.com/members/page1.htm?sort_by=md'
    ]

    def parse(self, response):
        for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
            item = FinalItem()
            website = row.xpath('./td[2]//a/@href').extract_first()
            item['name'] = row.xpath('./td[2]//a/text()').extract_first()
            request = scrapy.Request(website,
                                     callback=self.parse_page2)
            request.meta['item'] = item
            return request

    def parse_page2(self, response):
        item = response.meta['item']
        item['other_url'] = response.url
        item['groups'] = response.xpath('//div[@class="groups-btm"]/ul/li/text()').extract_first()
        return item
Use the meta field to pass the item forward to the next callback:
def parse_page1(self, response):
    item = MyItem(main_url=response.url)
    request = scrapy.Request("http://www.example.com/some_page.html",
                             callback=self.parse_page2)
    request.meta['item'] = item
    return request

def parse_page2(self, response):
    item = response.meta['item']
    item['other_url'] = response.url
    return item
UPD: to process all rows, use yield instead of return in your loop:
for row in response.xpath('//table[@class="member-table"]//tr[position() > 1]'):
    item = FinalItem()
    website = row.xpath('./td[2]//a/@href').extract_first()
    item['name'] = row.xpath('./td[2]//a/text()').extract_first()
    request = scrapy.Request(website,
                             callback=self.parse_page2)
    request.meta['item'] = item
    yield request
Edited question to link to original:
Scrapy getting data from links within table
From the link https://www.tdcj.state.tx.us/death_row/dr_info/trottiewillielast.html
I am trying to get info from the main table as well as the data behind the other two links within the table. I managed to pull from one, but my question is how to follow the other link as well and append its data so everything ends up in one line.
from urlparse import urljoin

import scrapy
from texasdeath.items import DeathItem

class DeathItem(Item):
    firstName = Field()
    lastName = Field()
    Age = Field()
    Date = Field()
    Race = Field()
    County = Field()
    Message = Field()
    Passage = Field()

class DeathSpider(scrapy.Spider):
    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        sites = response.xpath('//table/tbody/tr')
        for site in sites:
            item = DeathItem()
            item['firstName'] = site.xpath('td[5]/text()').extract()
            item['lastName'] = site.xpath('td[4]/text()').extract()
            item['Age'] = site.xpath('td[7]/text()').extract()
            item['Date'] = site.xpath('td[8]/text()').extract()
            item['Race'] = site.xpath('td[9]/text()').extract()
            item['County'] = site.xpath('td[10]/text()').extract()
            url = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
            url2 = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
            if url.endswith("html"):
                request = scrapy.Request(url, meta={"item": item, "url2": url2}, callback=self.parse_details)
                yield request
            else:
                yield item

    def parse_details(self, response):
        item = response.meta["item"]
        url2 = response.meta["url2"]
        item['Message'] = response.xpath("//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
        request = scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
        return request

    def parse_details2(self, response):
        item = response.meta["item"]
        item['Passage'] = response.xpath("//p/text()").extract_first()
        return item
I understand how we pass arguments to a request via meta, but I'm still unclear about the flow; at this point I am unsure whether this is possible or not. I have viewed several examples, including the ones below:
using scrapy extracting data inside links
How can i use multiple requests and pass items in between them in scrapy python
Technically the data should mirror the main table, just with each row extended by the data found behind both of its links.
Appreciate any help or direction.
The problem in this case is in this piece of code:
if url.endswith("html"):
    yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
else:
    yield item

if url2.endswith("html"):
    yield scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
else:
    yield item
By requesting a link you are creating a new "thread" that takes its own course of life, so parse_details won't be able to see what is being done in parse_details2. The way I would do it is to chain them, calling one from within the other, this way:
url = urljoin(response.url, site.xpath("td[2]/a/@href").extract_first())
url2 = urljoin(response.url, site.xpath("td[3]/a/@href").extract_first())
if url.endswith("html"):
    request = scrapy.Request(url, callback=self.parse_details)
    request.meta['item'] = item
    request.meta['url2'] = url2
    yield request
elif url2.endswith("html"):
    request = scrapy.Request(url2, callback=self.parse_details2)
    request.meta['item'] = item
    yield request
else:
    yield item

def parse_details(self, response):
    item = response.meta["item"]
    url2 = response.meta["url2"]
    item['About Me'] = response.xpath("//p[contains(text(), 'About Me')]/following-sibling::p/text()").extract()
    if url2:
        request = scrapy.Request(url2, callback=self.parse_details2)
        request.meta['item'] = item
        yield request
    else:
        yield item
This code hasn't been tested thoroughly, so comment as you test.
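The key design choice here is that the second URL rides along in meta until the first detail page has been parsed, so the two detail requests run one after the other instead of independently, and the item is only yielded once it carries data from both pages; the else branches make sure rows without detail pages still produce an item.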
I am building a simple(ish) parser in Scrapy and I am blissfully ignorant when it comes to Scrapy and Python :-) In the file item.py I have a definition of thisItem() which I assign to item in the code below. All worked rather swimmingly: parse uses a callback to get to parse_dir_contents... But then I realized I needed to scrape an extra bit of data and created another function, parse_other_content. How do I get what is already in item into parse_other_content?
import scrapy
from this-site.items import *
import re
import json

class DmozSpider(scrapy.Spider):
    name = "ABB"
    allowed_domains = ["this-site.com.au"]
    start_urls = [
        "https://www.this-site.com.au?page=1",
        "https://www.this-site.com.au?page=2",
    ]

    def parse(self, response):
        for href in response.xpath('//h3/a/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents)

    def parse_dir_contents(self, response):
        for sel in response.xpath('//h1[@itemprop="name"]'):
            item = thisItem()
            item['title'] = sel.xpath('text()').extract()
            item['rate'] = response.xpath('//div[@class="rate"]/div/span/text()').extract()
            so = re.search(r'\d+', response.url)
            propID = so.group()
            item['propid'] = propID
            item['link'] = response.url
            yield scrapy.Request("https://www.this-site.com.au/something?listing_id=" + propID, callback=self.parse_other_content)
            #yield item

    def parse_other_content(self, response):
        sel = json.loads(response.body)
        item['rate_detail'] = sel["this"][0]["that"]
        yield item
I know I am missing something simple here, but I can't seem to figure it out.
Per the scrapy documentation (http://doc.scrapy.org/en/1.0/topics/request-response.html#topics-request-response-ref-request-callback-arguments):
In some cases you may be interested in passing arguments to those callback functions so you can receive the arguments later, in the second callback. You can use the Request.meta attribute for that.
In your case I would do something like this:
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        item = thisItem()
        ...
        request = scrapy.Request("https://www.this-site.com.au/something?listing_id=" + propID, callback=self.parse_other_content)
        request.meta['item'] = item
        yield request

def parse_other_content(self, response):
    item = response.meta['item']
    # do something with the item
    return item
According to Steve (see the comments), you can also pass a dictionary of meta data as a keyword argument to the Request constructor, like so:
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        item = thisItem()
        ...
        request = scrapy.Request("https://www.this-site.com.au/something?listing_id=" + propID, callback=self.parse_other_content, meta={'item': item})
        yield request
You can either make item visible to parse_other_content() by changing it to self.item, or send it as a parameter to the function. (The first one might be easier.)
For the first solution, just add self. to any reference to the item variable. This makes it visible to the entire class.
def parse_dir_contents(self, response):
    for sel in response.xpath('//h1[@itemprop="name"]'):
        self.item = thisItem()
        self.item['title'] = sel.xpath('text()').extract()
        self.item['rate'] = response.xpath('//div[@class="rate"]/div/span/text()').extract()
        so = re.search(r'\d+', response.url)
        propID = so.group()
        self.item['propid'] = propID
        self.item['link'] = response.url
        yield scrapy.Request("https://www.this-site.com.au/something?listing_id=" + propID, callback=self.parse_other_content)
        #yield item

def parse_other_content(self, response):
    sel = json.loads(response.body)
    self.item['rate_detail'] = sel["this"][0]["that"]
    yield self.item
Suppose I have a BookItem; I need to add information to it in both the parse phase and the detail phase:
def parse(self, response):
    data = json.loads(response.body)
    for book in data['result']:
        item = BookItem()
        item['id'] = book['id']
        url = book['url']
        yield Request(url, callback=self.detail)

def detail(self, response):
    hxs = HtmlXPathSelector(response)
    item['price'] = ......
    # I want to continue with the same book item as in the for loop above
Using the code as is would lead to an undefined item in the detail phase. How can I pass the item to detail? detail(self, response, item) doesn't seem to work.
There is an argument named meta for Request:
yield Request(url, callback=self.detail, meta={'item': item})
then in function detail, access it this way:
item = response.meta['item']
See more details here, in the jobs topic of the Scrapy docs.
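Put together with the question's code, a minimal sketch of the meta approach might look like this (the spider name, start URL, and price XPath are placeholders; BookItem is the item class from the question):

import json

import scrapy
from scrapy import Request

class BookSpider(scrapy.Spider):
    name = "books"  # hypothetical name for illustration
    start_urls = ["https://example.com/api/books"]  # placeholder URL

    def parse(self, response):
        data = json.loads(response.text)
        for book in data['result']:
            item = BookItem()
            item['id'] = book['id']
            # attach the partially built item to the request
            yield Request(book['url'], callback=self.detail,
                          meta={'item': item})

    def detail(self, response):
        item = response.meta['item']  # pick up where parse left off
        item['price'] = response.xpath('//span[@class="price"]/text()').get()  # placeholder XPath
        yield item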
iMom0's approach still works, but as of Scrapy 1.7 the recommended approach is to pass user-defined information through cb_kwargs and leave meta for middlewares, extensions, etc.:
def parse(self, response):
    ....
    yield Request(url, callback=self.detail, cb_kwargs={'item': item})

def detail(self, response, item):
    item['price'] = ......
You could also pass the individual key-values into the cb_kwargs argument and then only instantiate the BookItem instance in the final callback (detail in this case):
def parse(self, response):
    data = json.loads(response.body)
    for book in data['result']:
        yield Request(book['url'],
                      callback=self.detail,
                      cb_kwargs=dict(id_=book['id'],
                                     url=book['url']))

def detail(self, response, id_, url):
    hxs = HtmlXPathSelector(response)
    item = BookItem()
    item['id'] = id_
    item['url'] = url
    item['price'] = ......
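This variant keeps only plain values (the id and url strings) in flight between callbacks and builds the item in one place, at the cost of repeating each field in both the cb_kwargs dict and detail's signature.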
You can define a variable in the __init__ method:
class MySpider(BaseSpider):
    ...

    def __init__(self):
        self.item = None

    def parse(self, response):
        data = json.loads(response.body)
        for book in data['result']:
            self.item = BookItem()
            self.item['id'] = book['id']
            url = book['url']
            yield Request(url, callback=self.detail)

    def detail(self, response):
        hxs = HtmlXPathSelector(response)
        self.item['price'] = ....