I started using Scrapy yesterday, working from this Scrapy-based project: https://github.com/prncc/steam-scraper, to get Steam review information. The existing code keeps paginating until there are no reviews left to scrape. However, I need to modify it a bit to get values from another page; more specifically, for a page like https://steamcommunity.com/app/416600/reviews, I would like to get the number of reviews written by each reviewer, which is displayed only on the reviewer's own review page (like this one, https://steamcommunity.com/profiles/76561197993023168/recommended/, whose author has 14 reviews).
The original code reads:
class ReviewSpider(scrapy.Spider):
    """Spider that walks a Steam product's paginated community reviews.

    The starting URL is chosen in ``start_requests``: an explicit
    ``steam_id`` wins, then a ``url_file`` with one URL per line, and
    finally the built-in ``test_urls``.
    """

    name = 'reviews'
    test_urls = [
        # Full Metal Furies
        'http://steamcommunity.com/app/416600/reviews/?browsefilter=mostrecent&p=1',
    ]

    def __init__(self, url_file=None, steam_id=None, *args, **kwargs):
        """Store the optional URL file path and Steam product id."""
        super().__init__(*args, **kwargs)
        self.url_file = url_file
        self.steam_id = steam_id

    def read_urls(self):
        """Yield one request per non-blank line of ``self.url_file``."""
        with open(self.url_file, 'r') as f:
            for url in f:
                url = url.strip()
                if url:
                    yield scrapy.Request(url, callback=self.parse)

    def start_requests(self):
        """Yield the initial request(s) from the first configured source."""
        if self.steam_id:
            url = (
                f'http://steamcommunity.com/app/{self.steam_id}/reviews/'
                '?browsefilter=mostrecent&p=1'
            )
            yield Request(url, callback=self.parse)
        elif self.url_file:
            yield from self.read_urls()
        else:
            for url in self.test_urls:
                yield Request(url, callback=self.parse)

    def parse(self, response):
        """Yield every review on the page, then the request for the next page."""
        page = get_page(response)
        product_id = get_product_id(response)

        # Load all reviews on current page.
        reviews = response.css('div .apphub_Card')
        for i, review in enumerate(reviews):
            yield load_review(review, product_id, page, i)

        # Navigate to next page.
        # ``@id`` selects the form's id attribute (XPath attribute axis).
        form = response.xpath('//form[contains(@id, "MoreContentForm")]')
        if form:
            yield self.process_pagination_form(form, page, product_id)

    def process_pagination_form(self, form, page=None, product_id=None):
        """Build the GET request that submits the "more content" form.

        The form's ``action`` becomes the URL and its ``input`` name/value
        pairs become the query data; ``page`` and ``product_id`` travel in
        the request's ``meta``.
        """
        action = form.xpath('@action').extract_first()
        names = form.xpath('input/@name').extract()
        values = form.xpath('input/@value').extract()

        formdata = dict(zip(names, values))
        meta = dict(prev_page=page, product_id=product_id)

        return FormRequest(
            url=action,
            method='GET',
            formdata=formdata,
            callback=self.parse,
            meta=meta
        )
What I tried to do is to add this in the parse function, just to get the number of reviews for a given user:
def parse(self, response):
    """Yield the page's reviews, each reviewer link's href, and the next page.

    NOTE(review): the value stored under ``'num_reviews'`` is the ``href``
    of the reviewer link, not a count; obtaining the count would require
    following that link with a further request.
    """
    page = get_page(response)
    product_id = get_product_id(response)

    # Load all reviews on current page.
    reviews = response.css('div .apphub_Card')
    for i, review in enumerate(reviews):
        yield load_review(review, product_id, page, i)

    # Get the path for each reviewer
    reviewers = response.xpath("/html/body/div[1]/div[5]/div[5]/div/div[1]/div/div/a[1]")
    for reviewer in reviewers:
        # ``@href`` is the XPath attribute axis; ``#href`` is not valid XPath.
        num_reviews = reviewer.xpath(".//@href").get()
        yield {
            'num_reviews': num_reviews
        }

    # Navigate to next page.
    form = response.xpath('//form[contains(@id, "MoreContentForm")]')
    if form:
        yield self.process_pagination_form(form, page, product_id)
But it did not work. The main issue is that I am not familiar with XPath in general, and I do not really understand how Scrapy is supposed to go to the other page, get the desired information, and then come back, iteratively for each review of a given game. How can I tackle this issue?
Related
I'm within reach of a personal milestone with Scrapy. The aim is to properly understand callbacks and cb_kwargs; I've read the documentation countless times, but I learn best from concrete code, practice and an explanation.
I have an example scraper whose aim is to grab each book's name and price, then go into each book's page and extract a single piece of information. I'm also trying to understand how to get information from the following pages, which I know depends on understanding how callbacks operate.
When I run my script it returns results only for the first page; how do I get the additional pages?
Here's my scraper:
class BooksItem(scrapy.Item):
    """Scraped book record; every field uses ``TakeFirst`` as output processor."""

    items = Field(output_processor=TakeFirst())
    price = Field(output_processor=TakeFirst())
    availability = Field(output_processor=TakeFirst())
class BookSpider(scrapy.Spider):
    """Spider for books.toscrape.com, as posted in the question.

    Only the XPath attribute syntax (``@class`` / ``@href``) is restored
    here; the behaviour the question asks about is left as posted.
    """

    name = "books"
    start_urls = ['https://books.toscrape.com']

    # NOTE(review): this method is spelled ``start_request`` and reads
    # ``self.start_url``, while the attribute defined above is ``start_urls``.
    def start_request(self):
        for url in self.start_url:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        """Load name and price, then request the first link found in the container."""
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')

            # The partially filled loader travels to parse_book via cb_kwargs.
            for url in [books.xpath('.//a//@href').get()]:
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})

        for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        """Add the availability text to the loader and emit the finished item."""
        book_quote = response.xpath('//p[@class="instock availability"]//text()').get()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
I believe the issue is with the part where I try to grab the next few pages. I have tried an alternative approach using the following:
def start_request(self):
    """Yield a request per start URL, seeding ``page_count`` at 0 via cb_kwargs."""
    for start_url in self.start_url:
        first_request = scrapy.Request(
            start_url,
            callback=self.parse,
            cb_kwargs={'page_count': 0},
        )
        yield first_request
def parse(self, response, next_page):
    """Stop after a few pages, otherwise follow the pager's "next" link.

    NOTE(review): the requests targeting this callback pass ``page_count``
    through ``cb_kwargs``, but the signature declares ``next_page``; the
    parameter name and the keyword do not match as posted.
    """
    if page_count > 3:
        return
    ...
    ...
    page_count += 1
    # ``@class`` / ``@href`` use the XPath attribute axis.
    for next_page in [response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()]:
        yield response.follow(next_page, callback=self.parse, cb_kwargs={'page_count': page_count})
However, I get the following error with this approach:
TypeError: parse() missing 1 required positional argument: 'page_cntr'
The method should be named start_requests, and inside it you should iterate over self.start_urls.
get() will return only the first result; what you want is getall(), which returns a list.
There is no need for a for loop in the "next_page" part; it's not a mistake, just unnecessary.
In the line for url in books.xpath you're getting every url twice; again, not a mistake, but still...
Here, with data = response.xpath('//div[@class = "col-sm-8 col-md-9"]'), you don't select the books one by one; you select the whole books container. You can check that len(data.getall()) == 1.
book_quote = response.xpath('//p[@class="instock availability"]//text()').get() will return \n; look at the source and try to find out why (hint: the 'i' tag).
Compare your code to this and see what I changed:
import scrapy
from scrapy import Field
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst
class BooksItem(scrapy.Item):
    # Each field declares TakeFirst() as its output processor.
    # Filled from the listing page by the spider's parse().
    items = Field(output_processor=TakeFirst())
    # Filled from the listing page by the spider's parse().
    price = Field(output_processor=TakeFirst())
    # Filled from the individual book page by parse_book().
    availability = Field(output_processor=TakeFirst())
class BookSpider(scrapy.Spider):
    """Scrape name, price and availability for every book on books.toscrape.com.

    ``parse`` handles a listing page: it starts one item loader per book,
    requests the book's own page to complete it, and follows the pager.
    """

    name = "books"
    start_urls = ['https://books.toscrape.com']

    def start_requests(self):
        """Yield one request per entry in ``start_urls``."""
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse)

    def parse(self, response):
        """Yield a detail-page request per book, then the next listing page."""
        # One <li> per book, rather than the single container <div>.
        data = response.xpath('//div[@class = "col-sm-8 col-md-9"]//li')
        for books in data:
            loader = ItemLoader(BooksItem(), selector=books)
            loader.add_xpath('items', './/article[@class="product_pod"]/h3/a//text()')
            loader.add_xpath('price', './/p[@class="price_color"]//text()')

            # The partially filled loader is handed to parse_book via cb_kwargs.
            for url in books.xpath('.//h3/a//@href').getall():
                yield scrapy.Request(
                    response.urljoin(url),
                    callback=self.parse_book,
                    cb_kwargs={'loader': loader})

        next_page = response.xpath('.//div/ul[@class="pager"]/li[@class="next"]/a//@href').get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)

    def parse_book(self, response, loader):
        """Add the availability text to ``loader`` and emit the finished item."""
        # option 1:
        # ``default=''`` keeps ``.strip()`` from failing when nothing matches.
        book_quote = response.xpath(
            '//p[@class="instock availability"]/i/following-sibling::text()'
        ).get(default='').strip()
        # option 2:
        # book_quote = ''.join(response.xpath('//div[contains(@class, "product_main")]//p[@class="instock availability"]//text()').getall()).strip()
        loader.add_value('availability', book_quote)
        yield loader.load_item()
I'm trying to scrape this website https://phdessay.com/free-essays/.
I need to find the maximum number of pages so that I can append the URLs with page numbers to the start_urls list. I'm not able to figure out how to do that.
Here's my code so far,
class PhdessaysSpider(scrapy.Spider):
    """Follow every essay link on the start page and record title and URL."""

    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        """Request each essay page linked from the listing."""
        essay_links = response.css('.phdessay-card-read::attr(href)').getall()
        yield from (
            scrapy.Request(link, callback=self.parse_essay_contents)
            for link in essay_links
        )

    def parse_essay_contents(self, response):
        """Emit an item holding the essay's title and the requested URL."""
        title = response.css('.site-title::text').get()
        url = response.request.url

        essay = PhdEssaysItem()
        essay['essay_title'] = title
        essay['essay_url'] = url
        yield essay
In the above code, I'm following each essay to its individual page and scraping the URL and the title (I will also be scraping the content, which is why I'm following the individual essay URLs).
This works just fine for the starting page, but there are about 1677 pages, a number which might change in the future. I would like to scrape this maximum page number and then generate the links for all page numbers.
What you could do is to find the last page number and then do a range loop to yield next pages requests.
Something like this:
class PhdessaysSpider(scrapy.Spider):
    """Crawl every listing page of phdessay.com/free-essays and each essay on it."""

    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        """Read the highest page number from the pager and request every page.

        Only purely numeric pager labels are considered, so a trailing
        "next" label or a thousands separator does not break the
        conversion. When no number is found, only page 1 is requested.
        """
        max_page = 1
        for label in response.css('.page-numbers::text').getall():
            digits = label.replace(',', '').strip()
            if digits.isdigit():
                max_page = max(max_page, int(digits))

        for page_number in range(1, max_page + 1):
            page_url = f'https://phdessay.com/free-essays/page/{page_number}/'
            yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        """Request each essay linked from one listing page."""
        all_essay_urls = response.css('.phdessay-card-read::attr(href)').getall()
        for essay_url in all_essay_urls:
            yield scrapy.Request(essay_url, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        """Emit an item holding the essay's title and the requested URL."""
        items = PhdEssaysItem()
        essay_title = response.css('.site-title::text').get()
        essay_url = response.request.url
        items['essay_title'] = essay_title
        items['essay_url'] = essay_url
        yield items
I'm scraping dior.com for its products. The head/script element gives me all the fields I need except for the product description. To scrape the description I need to follow a link (the url variable in the code below). The only way I know to do that is by using BeautifulSoup. Can I parse it using only Scrapy?
Thx guys.
class DiorSpider(CrawlSpider):
    """Crawl dior.com men's new arrivals and emit each product's SKU and URL.

    Product data is read from the inline script that contains
    ``window.initialState``.
    """

    name = 'dior'
    allowed_domains = ['www.dior.com']
    start_urls = ['https://www.dior.com/en_us/']

    rules = (
        Rule(
            # Dots are escaped so they match a literal "." only.
            LinkExtractor(allow=(r'^https?://www\.dior\.com/en_us/men/clothing/new-arrivals.*',)),
            callback='parse_file',
        ),  # trailing comma: without it ``rules`` is a bare Rule, not a tuple
    )

    # Compiled once instead of once per block.
    _SKU_RE = re.compile(r'("sku":)"[a-zA-Z0-9_]*"')
    _URL_RE = re.compile(r'("productLink":{"uri":)"[^"]*')

    def parse_file(self, response):
        """Yield ``{'sku': ..., 'url': ...}`` for each product found in the script."""
        script_text = response.xpath("//script[contains(., 'window.initialState')]").extract_first()
        if script_text is None:
            # No matching script on this page: nothing to extract.
            return

        for block in extract_blocks(script_text):
            skus = self._SKU_RE.finditer(block)
            urls = self._URL_RE.finditer(block)
            # SKU and link matches are paired positionally within a block.
            for sku_match, url_match in zip(skus, urls):
                scraped_info = {
                    'sku': sku_match.group(0).split(':')[1].replace('"', ''),
                    'url': 'https://www.dior.com' + url_match.group(0).split(':')[2].replace('"', '')
                }
                yield scraped_info
If you need to extract additional information from a second request, then instead of yielding the data there, you should yield a request for that URL and pass the information you have already extracted along in the Request.meta attribute.
from scrapy import Request
# …
def parse_file(self, response):
    """Sketch: request each product page, carrying the data gathered so far."""
    # …
    for block in blocks:
        # …
        for item in zip(sku, url):
            # …
            # Request the product URL built into scraped_info and hand the
            # partial record to the next callback through ``meta``.
            yield Request(
                scraped_info['url'],
                callback=self.parse_additional_information,
                meta={'scraped_info': scraped_info},
            )
def parse_additional_information(self, response):
    """Pick up the record passed in ``response.meta`` and emit it."""
    info = response.meta['scraped_info']
    # extract the additional information, add it to info
    yield info
I'm new to Scrapy and I'm struggling a little with a special case.
Here is the scenario:
I want to scrape a website that has a list of books.
httpx://...bookshop.../archive is the page where the first 10 books are listed.
Then I want to get the information (name, date, author) for all the books in the list. I have to go to another page for each book:
httpx://...bookshop.../book/{random_string}
So there are two types of request:
One for refreshing the list of books.
Another one for getting the book information.
But books can be added to the list at any time.
So I would like to refresh the list every minute,
and I also want to delay every request by 5 seconds.
Here is my basic solution, but it only works for one "loop":
First I set the delay in settings.py :
DOWNLOAD_DELAY = 5
then the code of my spider :
from scrapy.loader import ItemLoader
class bookshopScraper(scrapy.Spider):
    """Fetch the archive listing and request each book not seen recently."""

    name = "bookshop"
    url = "httpx://...bookshop.../archive"
    # Links already requested; trimmed to the newest ``history_size`` entries.
    history = []
    history_size = 10
    last_refresh = 0

    def start_requests(self):
        """Record the refresh time and request the archive page."""
        self.last_refresh = time.time()
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """Dispatch on the first path segment: archive listing or book page."""
        page = response.url.split("/")[3]
        if page == 'archive':
            return self.parse_archive(response)
        else:
            return self.parse_book(response)

    def parse_archive(self, response):
        """Yield a request for every listed link that is not in the history."""
        links = response.css('SOME CSS ').extract()
        for link in links:
            if link not in self.history:
                self.history.append(link)
                yield scrapy.Request(url="httpx://...bookshop.../book/" + link, callback=self.parse)

        # Keep only the newest entries. The original sliced an undefined
        # ``history`` name and kept the overflow count instead of the
        # retention size.
        if len(self.history) > self.history_size:
            self.history = self.history[-self.history_size:]

    def parse_book(self, response):
        """
        Load Item
        """
Now I would like to do something like :
if(time.time() > self.last_refresh + 80):
self.last_refresh = time.time()
return scrapy.Request(url=self.url, callback=self.parse, dont_filter=True)
But I really don't know how to implement this.
PS : I want the same instance of scrapy to run all the time without stopping.
Edited question to link to original:
Scrapy getting data from links within table
From the link https://www.tdcj.state.tx.us/death_row/dr_info/trottiewillielast.html
I am trying to get info from the main table as well as the data behind the other 2 links within the table. I managed to pull from one, but my question is how to go to the other link and append its data to the same row.
from urlparse import urljoin
import scrapy
from texasdeath.items import DeathItem
class DeathItem(Item):
    # Fields read from the main table row by DeathSpider.parse().
    firstName = Field()  # td[5]
    lastName = Field()  # td[4]
    Age = Field()  # td[7]
    Date = Field()  # td[8]
    Race = Field()  # td[9]
    County = Field()  # td[10]
    # Filled by DeathSpider.parse_details() from the first linked page.
    Message = Field()
    # Filled by DeathSpider.parse_details2() from the second linked page.
    Passage = Field()
class DeathSpider(scrapy.Spider):
    """Scrape the offenders table and enrich each row from its two linked pages.

    The item is passed along a chain of requests: ``parse`` ->
    ``parse_details`` -> ``parse_details2``, each adding its own field
    before the item is finally emitted.
    """

    name = "death"
    allowed_domains = ["tdcj.state.tx.us"]
    start_urls = [
        "http://www.tdcj.state.tx.us/death_row/dr_executed_offenders.html"
    ]

    def parse(self, response):
        """Build one item per table row and start its detail-page chain."""
        sites = response.xpath('//table/tbody/tr')
        for site in sites:
            item = DeathItem()

            item['firstName'] = site.xpath('td[5]/text()').extract()
            item['lastName'] = site.xpath('td[4]/text()').extract()
            item['Age'] = site.xpath('td[7]/text()').extract()
            item['Date'] = site.xpath('td[8]/text()').extract()
            item['Race'] = site.xpath('td[9]/text()').extract()
            item['County'] = site.xpath('td[10]/text()').extract()

            # A row without a link must not be joined: urljoin(base, None)
            # returns the listing URL itself, which also ends in "html".
            href = site.xpath("td[3]/a/@href").extract_first()
            href2 = site.xpath("td[2]/a/@href").extract_first()
            url = urljoin(response.url, href) if href else ""
            url2 = urljoin(response.url, href2) if href2 else ""

            if url.endswith("html"):
                request = scrapy.Request(url, meta={"item": item, "url2": url2}, callback=self.parse_details)
                yield request
            else:
                yield item

    def parse_details(self, response):
        """Add ``Message`` from the first linked page, then follow the second link."""
        item = response.meta["item"]
        url2 = response.meta["url2"]
        item['Message'] = response.xpath("//p[contains(text(), 'Last Statement')]/following-sibling::p/text()").extract()
        if not url2.endswith("html"):
            # No second HTML page to visit: emit what has been collected.
            return item
        request = scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
        return request

    def parse_details2(self, response):
        """Add ``Passage`` from the second linked page and emit the item."""
        item = response.meta["item"]
        item['Passage'] = response.xpath("//p/text()").extract_first()
        return item
I understand how we pass arguments to a request via meta, but I am still unclear about the flow; at this point I am unsure whether this is possible or not. I have viewed several examples, including the ones below:
using scrapy extracting data inside links
How can i use multiple requests and pass items in between them in scrapy python
Technically the data will reflect the main table just with both links containing data from within its link.
Appreciate any help or direction.
The problem in this case is in this piece of code
if url.endswith("html"):
yield scrapy.Request(url, meta={"item": item}, callback=self.parse_details)
else:
yield item
if url2.endswith("html"):
yield scrapy.Request(url2, meta={"item": item}, callback=self.parse_details2)
else:
yield item
By requesting a link you are creating a new "thread" that takes its own course, so the function parse_details won't be able to see what is being done in parse_details2. The way I would do it is to call one from within the other, this way:
url = urljoin(response.url, site.xpath("td[2]/a/#href").extract_first())
url2 = urljoin(response.url, site.xpath("td[3]/a/#href").extract_first()
if url.endswith("html"):
request=scrapy.Request(url, callback=self.parse_details)
request.meta['item']=item
request.meta['url2']=url2
yield request
elif url2.endswith("html"):
request=scrapy.Request(url2, callback=self.parse_details2)
request.meta['item']=item
yield request
else:
yield item
def parse_details(self, response):
    """Add this page's text to the item, then continue to ``url2`` if it is HTML."""
    item = response.meta["item"]
    url2 = response.meta["url2"]
    # NOTE(review): 'About Me' must be a field declared on the item class — confirm.
    item['About Me'] = response.xpath("//p[contains(text(), 'About Me')]/following-sibling::p/text()").extract()
    # url2 is the result of urljoin() and so is never empty; test its
    # extension, as the calling code does, so the else branch is reachable.
    if url2.endswith("html"):
        request = scrapy.Request(url2, callback=self.parse_details2)
        request.meta['item'] = item
        yield request
    else:
        yield item
This code hasn't been tested thoroughly, so please comment as you test.