def parse(self, response):
category_names = []
category_urls = []
for item in response.css("#zg_browseRoot ul li"):
category_url = item.css("a").css(self.CSS_URL).extract()
category_name = item.css("a").css(self.CSS_TEXT).extract()
category_url = [
self.parse_url(category_url, 4) for category_url in category_url
]
(category_url,) = category_url
(category_name,) = category_name
category_names.append(category_name)
category_urls.append(category_url)
for c_name, url in zip(category_names, category_urls):
self.c_name = [c_name]
yield scrapy.Request(url, callback=self.parse_categories)
def parse_url(self, url, number):
parse = urlparse(url)
split = parse.path.split("/")[:number]
return f'{self.BASE_URL}{"/".join(split)}'
def parse_categories(self, response):
sub_names = []
sub_urls = []
for item in response.css("#zg_browseRoot ul ul li"):
sub_name = item.css("a").css(self.CSS_TEXT).extract()
sub_url = item.css("a").css(self.CSS_URL).extract()
sub_url = [self.parse_url(sub_url, 5) for sub_url in sub_url]
(sub_url,) = sub_url
(sub_name,) = sub_name
sub_names.append(sub_name)
sub_urls.append(sub_url)
for sub_name, url in zip(sub_names, sub_urls):
self.sub_name = [sub_name]
# print("{}: {}, {}".format(url, self.sub_name, self.c_name))
yield scrapy.Request(url, callback=self.parse_subcategories)
def parse_subcategories(self, response):
url = self.parse_url(response.request.url, 5)
print(f"{self.c_name}, {self.sub_name}, {url}")
Hello everyone,
I'm having an issue with my Scrapy approach. I'm trying to scrape page which has categories and subcategories in which are items. I want to include category and subcategory with each item scraped.
The problem is that the Scrapys callback function is asynchronous and zipping the URLs with names doesn't seem to work, because the for loop is processed first, URLs are stored in a generator and names are staying behind. Can anyone help me to work around this?
Thanks in advance,
Daniel.
You can pass arbitrary data along with the requests by using th cb_kwargs parameter. You can read about the details here.
Here is a simplified example:
def parse(self, response):
rows = response.xpath('//div[#id="some-element"]')
for row in rows:
request_url = row.xpath('a/#href').get()
category = row.xpath('a/text()').get()
yield Request(
url=request_url,
callback=self.parse_category,
cb_kwargs={'category': category}
)
def parse_category(self, response, category): # Notice category arg in the func
# Process here
yield item
The data inserted in cb_kwargs is passed as a keyword arg into the callback function, so the key in the dict must match the name of the argument in the method definiton.
cb_kwargs were introduced in Scrapy v1.7, if you are using an older version you should use the meta param. You can read about it here, notice that the use is slightly different.
Related
Goal: I want to retrieve order performance data published on a specific e-commerce site. Since those data are spread across multiple pages for each order performance, we would like to extract the information for each page and finally summarize them as a single item or record.
I have looked through the official documentation and other similar Q.A.'s and found some.
From the information, I was able to get an idea that it might be possible to achieve this by using cb_kwargs.
However, I could not understand what is wrong with the following code.
[python - Interpreting callbacks and cb_kwargs with scrapy - Stack
Overflow]
(Interpreting callbacks and cb_kwargs with scrapy)
[python - Multiple pages per item in Scrapy.
(https://stackoverflow.com/questions/22201876/multiple-pages-per-item-in-scrapy?noredirect=1&lq=1)
The program runs, but the csv outputs nothing as shown in the image below.
enter image description here
The order results page contains information on 30 items per page.
I would like to first retrieve all of the signup dates for each, which are listed only on the first page, then move to each product page from there to retrieve the details, and then store those information one item at a time.
I am a beginner who started writing code in Python 3 months ago.
So I may have some problems with basic understanding about classes, etc.
I would appreciate it if you could point this out to me during our discussion.
The official documentation of scrapy is too unfriendly for beginners and I am having a hard time with it.
def parse_firstpage_item(self, response):
request = scrapy.Request(
url=response.url,
callback=self.parse_productpage_item,
cb_kwargs=dict(product_URL='//*[#id="buyeritemtable"]/div/ul/li[2]/p[1]/a'))
loader = ItemLoader(item = BuymaResearchtoolItem(), response = response)
loader.add_xpath("Conversion_date", '//*[#id="buyeritemtable"]/div/ul/li[2]/p[3]/text()')
yield loader.load_item()
def parse_productpage_item(self, response, product_URL):
loader = ItemLoader(item = BuymaResearchtoolItem(), response = response)
loader.add_xpath("brand_name", 'normalize-space(//*[#id="s_brand"]/dd/a/text())')
loader.add_value("page_URL" , response.url)
loader.add_xpath("inquire" , '//*[#id="tabmenu_inqcnt"]/text()')
yield loader.load_item()
class MyLinkExtractor(LinkExtractor):
def extract_links(self, response):
base_url = get_base_url(response)
if self.restrict_xpaths:
docs = [
subdoc
for x in self.restrict_xpaths
for subdoc in response.xpath(x)
]
else:
docs = [response.selector]
all_links = []
for doc in docs:
links = self._extract_links(doc, response.url, response.encoding, base_url)
all_links.extend(self._process_links(links))
logging.info('='*100)
logging.info(all_links)
logging.info(f'total liks len: {len(all_links)}')
logging.info('='*100)
return all_links
class AllSaledataSpider(CrawlSpider):
name = 'all_salesdata'
allowed_domains = ['www.buyma.com']
# start_urls = ['https://www.buyma.com/buyer/9887867/sales_1.html']
rules = (
Rule(MyLinkExtractor(
restrict_xpaths='//*[#class="buyeritem_name"]/a'), callback='parse_firstpage_item', follow=False),
Rule(LinkExtractor(restrict_xpaths='//DIV[#class="pager"]/DIV/A[contains(text(),"次")]'),follow=False)
)
def _requests_to_follow(self, response):
if not isinstance(response, HtmlResponse):
return
seen = set()
for rule_index, rule in enumerate(self._rules):
links = [lnk for lnk in rule.link_extractor.extract_links(response)]
#if lnk not in seen]
for link in rule.process_links(links):
seen.add(link)
request = self._build_request(rule_index, link)
yield rule.process_request(request, response)
def start_requests(self):
with open('/Users/morni/buyma_researchtool/buyma_researchtool/AllshoppersURL.csv', 'r', encoding='utf-8') as f:
reader = csv.reader(f)
header = next(reader)
for row in reader:
yield scrapy.Request(url = str(row[2])[:-5]+'/sales_1.html')
for row in self.reader:
for n in range(1, 300):
url = f'{self.base_page}{row}/sales_{n}.html'
yield scrapy.Request(
url=url,
callback=self.parse_firstpage_item,
errback=self.errback_httpbin,
dont_filter=True
)
def parse_firstpage_item(self, response):
loader = ItemLoader(item = BuymaResearchtoolItem(), response = response)
loader.add_xpath("Conversion_date", '//*[#id="buyeritemtable"]/div/ul/li[2]/p[3]/text()')
loader.add_xpath("product_name" , '//*[#id="buyeritemtable"]/div/ul/li[2]/p[1]/a/text()')
loader.add_value("product_URL" , '//*[#id="buyeritemtable"]/div/ul/li[2]/p[1]/a/#href')
item = loader.load_item()
yield scrapy.Request(
url=response.urljoin(item['product_URL']),
callback=self.parse_productpage_item,
cb_kwargs={'item': item},
)
def parse_productpage_item(self, response, item):
loader = ItemLoader(item=item, response = response)
loader.add_xpath("brand_name", 'normalize-space(//*[#id="s_brand"]/dd/a/text())')
〜
yield loader.load_item()
You need to call each page and pass your current item to callback:
def parse_first_page(self, response):
loader = ItemLoader(item = BuymaResearchtoolItem(), response = response)
loader.add_xpath("brand_name", 'normalize-space(//*[#id="s_brand"]/dd/a/text())')
loader.add_value("page_URL" , response.url)
loader.add_xpath("inquire" , '//*[#id="tabmenu_inqcnt"]/text()')
item = loader.load_item()
yield scrapy. Request(
url=second_page_url,
callback=self.parse_second_page,
cb_kwargs={'item': item},
)
def parse_second_page(self, response, item):
loader = ItemLoader(item=item, response=response)
loader.add_xpath("Conversion_date", '//*[#id="buyeritemtable"]/div/ul/li[2]/p[3]/text()')
item = loader.load_item()
yield scrapy. Request(
url=third_page_url,
callback=self.parse_third_page,
cb_kwargs={'item': item},
)
def parse_third_page(self, response, item):
loader = ItemLoader(item=item, response=response)
loader.add_value('ThirdUrl', response.url)
yield loader.load_item()
I'm in reach of a personal milestone with scrapy. The aim is to properly understand the callback and cb_kwargs, I've read the documentation countless times but I learn best with visual code, practice and an explanation.
I have an example scraper, the aim is to grab the book name, price and go into each book page and extract a single piece of information. I'm trying to understand how to properly get information on the next few pages also, which I know is dependent on understanding the operation of callbacks.
When I run my script It returns results only for the first page, how do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
items = Field(output_processor = TakeFirst())
price = Field(output_processor = TakeFirst())
availability = Field(output_processor = TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]')
for books in data:
loader = ItemLoader(BooksItem(), selector = books)
loader.add_xpath('items','.//article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price','.//p[#class="price_color"]//text()')
for url in [books.xpath('.//a//#href').get()]:
yield scrapy.Request(
response.urljoin(url),
callback = self.parse_book,
cb_kwargs = {'loader':loader})
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
book_quote = response.xpath('//p[#class="instock availability"]//text()').get()
loader.add_value('availability', book_quote)
yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
for url in self.start_url:
yield scrapy.Request(
url,
callback = self.parse,
cb_kwargs = {'page_count':0}
)
def parse(self, response, next_page):
if page_count > 3:
return
...
...
page_count += 1
for next_page in [response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()]:
yield response.follow(next_page, callback=self.parse, cb_kwargs = {'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
It should be start_requests, and self.start_urls (inside the function).
get() will return the first result, what you want is getall() in order to return a list.
There is no need for a for loop for the "next_page" part, it's not a mistake just unnecessary.
In the line for url in books.xpath you're getting every url twice, again not a mistake but still...
Here data = response.xpath('//div[#class = "col-sm-8 col-md-9"]') you don't select the books one by one, you select the whole books container, you can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[#class="instock availability"]//text()').get() will return \n, look at the source at try to find out why (hint: 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BooksItem(scrapy.Item):
items = Field(output_processor=TakeFirst())
price = Field(output_processor=TakeFirst())
availability = Field(output_processor=TakeFirst())
class BookSpider(scrapy.Spider):
name = "books"
start_urls = ['https://books.toscrape.com']
def start_requests(self):
for url in self.start_urls:
yield scrapy.Request(
url,
callback=self.parse)
def parse(self, response):
data = response.xpath('//div[#class = "col-sm-8 col-md-9"]//li')
for books in data:
loader = ItemLoader(BooksItem(), selector=books)
loader.add_xpath('items', './/article[#class="product_pod"]/h3/a//text()')
loader.add_xpath('price', './/p[#class="price_color"]//text()')
for url in books.xpath('.//h3/a//#href').getall():
yield scrapy.Request(
response.urljoin(url),
callback=self.parse_book,
cb_kwargs={'loader': loader})
next_page = response.xpath('.//div/ul[#class="pager"]/li[#class="next"]/a//#href').get()
if next_page:
yield response.follow(next_page, callback=self.parse)
def parse_book(self, response, loader):
# option 1:
book_quote = response.xpath('//p[#class="instock availability"]/i/following-sibling::text()').get().strip()
# option 2:
# book_quote = ''.join(response.xpath('//div[contains(#class, "product_main")]//p[#class="instock availability"]//text()').getall()).strip()
loader.add_value('availability', book_quote)
yield loader.load_item()
I need to parse data from the site. After parsing, data must be saved to disk. I am using scrapy. When working, I need to get data from another page. How can I do that?
class MySpider(scrapy.Spider):
name = "my_spyder"
start_urls = [
'https://www.example.com/title/1',
'https://www.example.com/title/2',
'https://www.example.com/title/3',
]
def parse(self, response):
item = MyItem()
main_page_selector = Selector(response)
...
tagline_url = os.path.join(response.url, 'taglines')
request = Request(url=tagline_url, callback=get_tags)
item['tags'] = yield request
...
yield item
def get_tags(self, response):
tagline_selector = Selector(response)
taglines = []
for tag in tagline_selector.xpath('//div[#class="soda even"))]/text()').getall():
taglines.append(tag.strip())
return taglines
how to write in the 'item' field 'tags' received during the function 'get_tags'?
these requests are executed asynchronously.
request = Request(url=tagline_url, callback=get_tags)
request.meta["item"] = item
yield request
Above code on parse method
item = response.meta["item"]
#...
item["tags"] = taglines
yield item
The second code in the get_tags method
Edited question to link to original:
Scrapy getting data from links within table
From the link https://www.tdcj.state.tx.us/death_row/dr_info/trottiewillielast.html
I am trying to get info from the main table as well as the data within the other 2 links within the table. I managed to pull from one, but question is going to the other link and appending the data in one line.
from urlparse import urljoin
import scrapy
from texasdeath.items import DeathItem
class DeathItem(Item):
firstName = Field()
lastName = Field()
Age = Field()
Date = Field()
Race = Field()
County = Field()
Message = Field()
Passage = Field()
class DeathSpider(scrapy.Spider):
name = "death"
allowed_domains = ["tdcj.state.tx.us"]
start_urls = [
"http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
]
def parse(self, response):
sites = response.xpath('//table/tbody/tr')
for site in sites:
item = DeathItem()
item['firstName'] = site.xpath('td[5]/text()').extract()
item['lastName'] = site.xpath('td[4]/text()').extract()
item['Age'] = site.xpath('td[7]/text()').extract()
item['Date'] = site.xpath('td[8]/text()').extract()
item['Race'] = site.xpath('td[9]/text()').extract()
item['County'] = site.xpath('td[10]/text()').extract()
url = urljoin(response.url, site.xpath("td[3]/a/#href").extract_first())
url2 = urljoin(response.url, site.xpath("td[2]/a/#href").extract_first())
if url.endswith("html"):
request = scrapy.Request(url, meta={"item": item,"url2" : url2}, callback=self.parse_details)
yield request
else:
yield item
def parse_details(self, response):
item = response.meta["item"]
url2 = response.meta["url2"]
item['Message'] = response.xpath("//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
request = scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
return request
def parse_details2(self, response):
item = response.meta["item"]
item['Passage'] = response.xpath("//p/text()").extract_first()
return item
I understand how we pass arguments to a request and meta. But still unclear of the flow, at this point I am unsure whether this is possible or not. I have viewed several examples including the ones below:
using scrapy extracting data inside links
How can i use multiple requests and pass items in between them in scrapy python
Technically the data will reflect the main table just with both links containing data from within its link.
Appreciate any help or direction.
The problem in this case is in this piece of code
if url.endswith("html"):
yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
else:
yield item
if url2.endswith("html"):
yield scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
else:
yield item
By requesting a link you are creating a new "thread" that will take its own course of life so, the function parse_details wont be able to see what is being done in parse_details2, the way I would do it is call one within each other this way
url = urljoin(response.url, site.xpath("td[2]/a/#href").extract_first())
url2 = urljoin(response.url, site.xpath("td[3]/a/#href").extract_first()
if url.endswith("html"):
request=scrapy.Request(url, callback=self.parse_details)
request.meta['item']=item
request.meta['url2']=url2
yield request
elif url2.endswith("html"):
request=scrapy.Request(url2, callback=self.parse_details2)
request.meta['item']=item
yield request
else:
yield item
def parse_details(self, response):
item = response.meta["item"]
url2 = response.meta["url2"]
item['About Me'] = response.xpath("//p[contains(text(), 'About Me')]/following-sibling::p/text()").extract()
if url2:
request=scrapy.Request(url2, callback=self.parse_details2)
request.meta['item']=item
yield request
else:
yield item
This code hasn't been tested thoroughly so comment as you test
I am building a simple(ish) parser in Scrapy and I am blissfully ignorant when it comes to scrapy and Python :-) In the file item.py I have a definition of thisItem() which I assign to item in the code below. All worked rather swimmingly, parseusing a callback to get to parse_dir_content... But then I realized I needed to scrape an extra bit of data and created another function parse_other_content. How do I get what is already in item into parse_other_content?
import scrapy
from this-site.items import *
import re
import json
class DmozSpider(scrapy.Spider):
name = "ABB"
allowed_domains = ["this-site.com.au"]
start_urls = [
"https://www.this-site.com.au?page=1",
"https://www.this-site.com.au?page=2",
]
def parse(self, response):
for href in response.xpath('//h3/a/#href'):
url = response.urljoin(href.extract())
yield scrapy.Request(url, callback=self.parse_dir_contents)
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
item = thisItem()
item['title'] = sel.xpath('text()').extract()
item['rate'] = response.xpath('//div[#class="rate"]/div/span/text()').extract()
so = re.search( r'\d+', response.url)
propID = so.group()
item['propid'] = propID
item['link'] = response.url
yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content)
#yield item
def parse_other_content(self, reponse):
sel = json.loads(reponse.body)
item['rate_detail'] = sel["this"][0]["that"]
yield item
I know I am missing something simple here, but I can't seem to figure it out.
Per the scrapy documentation (http://doc.scrapy.org/en/1.0/topics/request-response.html#topics-request-response-ref-request-callback-arguments):
In some cases you may be interested in passing arguments to those callback functions so you can receive the arguments later, in the second callback. You can use the Request.meta attribute for that.
In your case I would do something like this:
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
item = thisItem()
...
request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content)
request.meta['item'] = item
yield request
def parse_other_content(self, response):
item = response.meta['item']
# do something with the item
return item
According to Steve (see comments) you can also pass a dictionary of meta data as a keyword argument to the Request constructor like so:
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
item = thisItem()
...
request = scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content, meta={'item':item})
yield request
You can either allow item to be visible to parse_other_content() by changing it to self.item, or sending it as a parameter to the function. (The first one might be easier.)
For the first solution just add self. to any reference to the item variable. This makes it visible to the entire class.
def parse_dir_contents(self, response):
for sel in response.xpath('//h1[#itemprop="name"]'):
self.item = thisItem()
self.item['title'] = sel.xpath('text()').extract()
self.item['rate'] = response.xpath('//div[#class="rate"]/div/span/text()').extract()
so = re.search( r'\d+', response.url)
propID = so.group()
self.item['propid'] = propID
self.item['link'] = response.url
yield scrapy.Request("https://www.this-site.com.au/something?listing_id="+propID,callback=self.parse_other_content)
#yield item
def parse_other_content(self, reponse):
sel = json.loads(reponse.body)
self.item['rate_detail'] = sel["this"][0]["that"]
yield self.item