I'm new to Scrapy and I'm struggling a little with a special case.
Here is the scenario:
I want to scrape a website that has a list of books.
httpx://...bookshop.../archive is the page where the first 10 books are listed.
Then I want to get the information (name, date, author) for every book in the list. I have to go to another page for each book:
httpx://...bookshop.../book/{random_string}
So there are two types of request:
One for refreshing the list of books.
Another one for getting the book information.
But new books can be added to the list at any time.
So I would like to refresh the list every minute.
I also want to delay all requests by 5 seconds.
Here is my basic solution, but it only works for one "loop":
First I set the delay in settings.py:
DOWNLOAD_DELAY = 5
Then the code of my spider:
import time

import scrapy
from scrapy.loader import ItemLoader


class bookshopScraper(scrapy.Spider):
    name = "bookshop"
    url = "httpx://...bookshop.../archive"
    history = []
    last_refresh = 0

    def start_requests(self):
        self.last_refresh = time.time()
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        # Route the response depending on which kind of page came back.
        page = response.url.split("/")[3]
        if page == 'archive':
            return self.parse_archive(response)
        else:
            return self.parse_book(response)

    def parse_archive(self, response):
        links = response.css('SOME CSS ').extract()
        for link in links:
            if link not in self.history:
                self.history.append(link)
                yield scrapy.Request(url="httpx://...bookshop.../book/" + link, callback=self.parse)
        if len(self.history) > 10:
            n = len(self.history) - 10
            self.history = self.history[-n:]

    def parse_book(self, response):
        """
        Load Item
        """
Now I would like to do something like:
if time.time() > self.last_refresh + 80:
    self.last_refresh = time.time()
    return scrapy.Request(url=self.url, callback=self.parse, dont_filter=True)
But I really don't know how to implement this.
PS: I want the same Scrapy instance to run all the time without stopping.
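Here is a minimal sketch of one way the periodic refresh could be wired up (not the original code): Scrapy's spider_idle signal keeps the spider alive and re-queues the archive page once a minute, while DOWNLOAD_DELAY = 5 keeps throttling individual requests. The 60-second interval is an assumption, and on older Scrapy versions engine.crawl also expects the spider as a second argument.

import time

import scrapy
from scrapy import signals
from scrapy.exceptions import DontCloseSpider


class bookshopScraper(scrapy.Spider):
    name = "bookshop"
    url = "httpx://...bookshop.../archive"
    last_refresh = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # spider_idle fires whenever the scheduler runs dry; that is the
        # moment to decide whether to re-queue the archive page.
        crawler.signals.connect(spider.handle_idle, signal=signals.spider_idle)
        return spider

    def handle_idle(self, spider):
        if time.time() - self.last_refresh >= 60:
            self.last_refresh = time.time()
            self.crawler.engine.crawl(
                scrapy.Request(self.url, callback=self.parse, dont_filter=True)
            )
        # Raising DontCloseSpider keeps the crawl running even though no
        # requests are currently queued; the signal fires again shortly.
        raise DontCloseSpider

    def parse(self, response):
        # ... same routing to parse_archive / parse_book as above ...
        pass

With this in place the crawl never finishes on its own: whenever the queue empties, the idle handler either waits (the signal fires again a few seconds later) or schedules a fresh archive request, and dont_filter=True stops the dupefilter from dropping the repeated URL.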
I want to get the URL of a video (.mp4) from an iframe using Python (or Rust); it doesn't matter which library. For example, I have:
<iframe src="https://spinning.allohalive.com/?kp=1332827&token=b51bdfc8af17dee996d3eae53726df" />
I really have no idea how to do this. Help me please! If you need some more information, just ask.
The code that I use to parse iframes from a website:
import scrapy
from cimber.models.website import Website


class KinokradSpider(scrapy.Spider):
    name = "kinokrad"
    start_urls = [Website.Kinokrad.value]

    def __init__(self):
        self.pages_count = 1

    def parse(self, response):
        pages_count = self.get_pages_count(response)
        if self.pages_count <= pages_count:
            # Follow every film on the current listing page.
            for film in response.css("div.shorposterbox"):
                film_url = film.css("div.postertitle").css("a").attrib["href"]
                yield scrapy.Request(film_url, callback=self.parse_film)
            # Then queue the next listing page.
            next_page = f"{Website.Kinokrad.value}/page/{self.pages_count}"
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
                self.pages_count += 1

    def parse_film(self, response):
        name = response.css("div.fallsttitle").css("h1::text").get().strip()
        players = []
        for player in response.css("iframe::attr(src)").extract():
            players.append(player)
        yield {
            "name": name,
            "players": players
        }

    def get_pages_count(self, response) -> int:
        # The last pagination link holds the highest page number.
        links = response.css("div.navcent").css("a")
        last_link = links[len(links) - 1].attrib["href"]
        return int(last_link.split("/page/")[1].replace("/", "").strip())
I've been trying for 2 weeks, and finally I'm asking this question on Stack Overflow. First I used BeautifulSoup, then Selenium, and now Scrapy. I have a lot of code to automatically parse iframes, but I need the mp4 URL. I've already tried solutions from Stack Overflow, but they don't work, so please don't remove my question.
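One possible direction, sketched under assumptions (the spider name and selectors are illustrative, and the two callbacks are meant to slot into a spider like KinokradSpider above): follow each iframe src with a new request and look for an .mp4 URL in the player page. Many players build the video URL in JavaScript, in which case the raw HTML will not contain it and a headless browser or the player's own API would be needed.

import re

import scrapy


class PlayerMp4Spider(scrapy.Spider):
    name = "player_mp4"  # hypothetical spider name

    def parse_film(self, response):
        # Follow every embedded player page found on the film page.
        for src in response.css("iframe::attr(src)").getall():
            yield response.follow(src, callback=self.parse_player)

    def parse_player(self, response):
        # Prefer explicit <video>/<source> tags, then fall back to any
        # .mp4 URL appearing in the page source (for example inside scripts).
        mp4 = response.css("video source::attr(src), video::attr(src)").get()
        if not mp4:
            match = re.search(r"https?://[^\"'\s]+\.mp4", response.text)
            mp4 = match.group(0) if match else None
        if mp4:
            yield {"player_page": response.url, "mp4_url": mp4}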
I started to use Scrapy yesterday, following this Scrapy project: https://github.com/prncc/steam-scraper to get Steam review information. The existing code allows for continuous scrolling until there is no review left to scrape. However, I need to modify it a bit to be able to get values from another page; more specifically, on a page like https://steamcommunity.com/app/416600/reviews, I would like to get the number of reviews written by each reviewer, which is displayed only on their review page (like this one: https://steamcommunity.com/profiles/76561197993023168/recommended/, a reviewer who has 14 reviews).
The original code reads:
import scrapy
from scrapy import FormRequest, Request

# get_page, get_product_id and load_review are helpers from the
# steam-scraper project linked above.


class ReviewSpider(scrapy.Spider):
    name = 'reviews'
    test_urls = [
        # Full Metal Furies
        'http://steamcommunity.com/app/416600/reviews/?browsefilter=mostrecent&p=1',
    ]

    def __init__(self, url_file=None, steam_id=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.url_file = url_file
        self.steam_id = steam_id

    def read_urls(self):
        with open(self.url_file, 'r') as f:
            for url in f:
                url = url.strip()
                if url:
                    yield scrapy.Request(url, callback=self.parse)

    def start_requests(self):
        if self.steam_id:
            url = (
                f'http://steamcommunity.com/app/{self.steam_id}/reviews/'
                '?browsefilter=mostrecent&p=1'
            )
            yield Request(url, callback=self.parse)
        elif self.url_file:
            yield from self.read_urls()
        else:
            for url in self.test_urls:
                yield Request(url, callback=self.parse)

    def parse(self, response):
        page = get_page(response)
        product_id = get_product_id(response)

        # Load all reviews on current page.
        reviews = response.css('div .apphub_Card')
        for i, review in enumerate(reviews):
            yield load_review(review, product_id, page, i)

        # Navigate to next page.
        form = response.xpath('//form[contains(@id, "MoreContentForm")]')
        if form:
            yield self.process_pagination_form(form, page, product_id)

    def process_pagination_form(self, form, page=None, product_id=None):
        action = form.xpath('@action').extract_first()
        names = form.xpath('input/@name').extract()
        values = form.xpath('input/@value').extract()
        formdata = dict(zip(names, values))
        meta = dict(prev_page=page, product_id=product_id)
        return FormRequest(
            url=action,
            method='GET',
            formdata=formdata,
            callback=self.parse,
            meta=meta
        )
What I tried to do is add this to the parse function, just to get the number of reviews for a given user:
def parse(self, response):
    page = get_page(response)
    product_id = get_product_id(response)

    # Load all reviews on current page.
    reviews = response.css('div .apphub_Card')
    for i, review in enumerate(reviews):
        yield load_review(review, product_id, page, i)

    Reviewers = response.xpath("/html/body/div[1]/div[5]/div[5]/div/div[1]/div/div/a[1]")  # Get the path for each reviewer
    for IndividualReview in Reviewers:
        num_reviews = IndividualReview.xpath(".//@href").get()
        yield {
            'num_reviews': num_reviews
        }

    # Navigate to next page.
    form = response.xpath('//form[contains(@id, "MoreContentForm")]')
    if form:
        yield self.process_pagination_form(form, page, product_id)
But it did not work. The main issue is that I am not familiar with XPath in general, and I do not really understand how Scrapy is supposed to go to the other page, get the desired information and then go back, iteratively for each review of a given game. How can I tackle this issue?
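For what it's worth, here is a sketch of the "go to the other page" part: yield a request for the reviewer's page from parse and extract the count there. The selectors and field names below are assumptions rather than something taken from the steam-scraper project, so they will need adjusting to the real markup.

import scrapy


class ReviewerCountSpider(scrapy.Spider):
    name = 'reviewer_counts'  # hypothetical name
    start_urls = [
        'http://steamcommunity.com/app/416600/reviews/?browsefilter=mostrecent&p=1',
    ]

    def parse(self, response):
        for review in response.css('div .apphub_Card'):
            # Link to the reviewer's individual review page (assumed selector).
            link = review.css('a::attr(href)').get()
            if link:
                yield response.follow(
                    link,
                    callback=self.parse_reviewer,
                    cb_kwargs={'review_page': response.url},
                )

    def parse_reviewer(self, response, review_page):
        # The profile page shows something like "14 reviews"; pull the first
        # number out of a link mentioning /recommended (again, an assumption).
        num_reviews = response.xpath(
            '//a[contains(@href, "/recommended")]/text()').re_first(r'(\d+)')
        yield {
            'review_page': review_page,
            'reviewer_url': response.url,
            'num_reviews': num_reviews,
        }

There is no need to "go back": each followed request carries enough context (here via cb_kwargs) for its own callback to yield a complete item, and Scrapy keeps crawling the listing pages in parallel.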
I'm trying to scrape this website https://phdessay.com/free-essays/.
I need to find the maximum number of pages so that I can append the URLs with page numbers to the start_urls list. I'm not able to figure out how to do that.
Here's my code so far:
import scrapy

# PhdEssaysItem is defined in the project's items.py.


class PhdessaysSpider(scrapy.Spider):
    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        all_essay_urls = response.css('.phdessay-card-read::attr(href)').getall()
        for essay_url in all_essay_urls:
            yield scrapy.Request(essay_url, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        items = PhdEssaysItem()
        essay_title = response.css('.site-title::text').get()
        essay_url = response.request.url
        items['essay_title'] = essay_title
        items['essay_url'] = essay_url
        yield items
In the above code, I'm following each essay to its individual page and scraping the URL and the title (I will also be scraping the content, which is why I'm following the individual essay URL).
This works just fine for the starting page, but there are about 1677 pages, and that number might change in the future. I would like to scrape this maximum_no_of_pages number and then generate the links for all page numbers.
What you could do is find the last page number and then do a range loop to yield the requests for the next pages.
Something like this:
class PhdessaysSpider(scrapy.Spider):
    name = 'phdessays'
    start_urls = ['https://phdessay.com/free-essays/']

    def parse(self, response):
        max_page = int(response.css('.page-numbers::text').getall()[-1])
        for page_number in range(1, max_page + 1):
            page_url = f'https://phdessay.com/free-essays/page/{page_number}/'
            yield scrapy.Request(page_url, callback=self.parse_page)

    def parse_page(self, response):
        all_essay_urls = response.css('.phdessay-card-read::attr(href)').getall()
        for essay_url in all_essay_urls:
            yield scrapy.Request(essay_url, callback=self.parse_essay_contents)

    def parse_essay_contents(self, response):
        items = PhdEssaysItem()
        essay_title = response.css('.site-title::text').get()
        essay_url = response.request.url
        items['essay_title'] = essay_title
        items['essay_url'] = essay_url
        yield items
I am working on a class project and trying to get all IMDB movie data (titles, budgets, etc.) up until 2016. I adapted the code from https://github.com/alexwhb/IMDB-spider/blob/master/tutorial/spiders/spider.py.
My thought is: for i in range(1874, 2016) (since 1874 is the earliest year shown on http://www.imdb.com/year/), direct the program to the corresponding year's website and grab the data from that URL.
But the problem is that each page for each year only shows 50 movies, so after crawling those 50 movies, how can I move on to the next page? And after crawling each year, how can I move on to the next year? This is my code for the URL-parsing part so far, but it is only able to crawl 50 movies for a particular year.
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = ["http://www.imdb.com/search/title?year=2014,2014&title_type=feature&sort=moviemeter,asc"]

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
You can use CrawlSpider to simplify your task. As you'll see below, start_requests dynamically generates the list of URLs while parse_page only extracts the movies to crawl. Finding and following the 'Next' link is done by the rules attribute.
I agree with @Padraic Cunningham that hard-coding values is not a great idea. I've added spider arguments so that you can call:
scrapy crawl imdb -a start=1950 -a end=1980 (the scraper will default to 1874-2016 if it doesn't get any arguments).
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from imdbyear.items import MovieItem


class IMDBSpider(CrawlSpider):
    name = 'imdb'
    rules = (
        # extract links at the bottom of the page. note that there are 'Prev' and 'Next'
        # links, so a bit of additional filtering is needed
        Rule(LinkExtractor(restrict_xpaths=('//*[@id="right"]/span/a')),
             process_links=lambda links: filter(lambda l: 'Next' in l.text, links),
             callback='parse_page',
             follow=True),
    )

    def __init__(self, start=None, end=None, *args, **kwargs):
        super(IMDBSpider, self).__init__(*args, **kwargs)
        self.start_year = int(start) if start else 1874
        self.end_year = int(end) if end else 2016

    # generate start_urls dynamically
    def start_requests(self):
        for year in range(self.start_year, self.end_year + 1):
            yield scrapy.Request('http://www.imdb.com/search/title?year=%d,%d&title_type=feature&sort=moviemeter,asc' % (year, year))

    def parse_page(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            # note -- you had 'MianPageUrl' as your scrapy field name. I would recommend fixing this typo
            # (you will need to change it in items.py as well)
            item['MainPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request

    # make sure that the dynamically generated start_urls are parsed as well
    parse_start_url = parse_page

    # do your magic
    def parseMovieDetails(self, response):
        pass
You can use the piece of code below to follow the next page:
# 'a.lister-page-next.next-page::attr(href)' is the selector to get the next page link
next_page = response.css('a.lister-page-next.next-page::attr(href)').extract_first()
if next_page is not None:
    next_page = response.urljoin(next_page)  # joins the current and next page URLs
    yield scrapy.Request(next_page, callback=self.parse)  # calls the parse function again for the next page
I figured out a very dumb way to solve this. I put all the links in start_urls. A better solution would be very much appreciated!
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = []
    for i in xrange(1874, 2017):
        for j in xrange(1, 11501, 50):
            # since the largest number of movies for a year to have is 11,400 (2016)
            start_url = "http://www.imdb.com/search/title?sort=moviemeter,asc&start=" + str(j) + "&title_type=feature&year=" + str(i) + "," + str(i)
            start_urls.append(start_url)

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
The code that @Greg Sadetsky provided needs some minor changes. Well, only one change, in the first line of the parse_page method.
Just change the XPath in the for loop from:
response.xpath("//*[#class='results']/tr/td[3]"):
to
response.xpath("//*[contains(#class,'lister-item-content')]/h3"):
This worked like a charm for me!
I have written a spider that can take the information from this page and can follow the "Next page" links. Right now, the spider just takes the information that I'm showing in the following structure.
The structure of the page is something like this
Title 1
URL 1 ---------> If you click you go to one page with more information
Location 1
Title 2
URL 2 ---------> If you click you go to one page with more information
Location 2
Next page
What I want is for the spider to go to each URL link and get the full information. I suppose I must generate another rule specifying that I want to do something like this.
The behaviour of the spider should be:
Go to URL1 (get info)
Go to URL2 (get info)
...
Next page
But I don't know how I can implement it. Can someone guide me?
Code of my Spider:
class BcnSpider(CrawlSpider):
    name = 'bcn'
    allowed_domains = ['guia.bcn.cat']
    start_urls = ['http://guia.bcn.cat/index.php?pg=search&q=*:*']

    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(re.escape("index.php")),
                restrict_xpaths=("//div[@class='paginador']")),
            callback="parse_item",
            follow=True),
    )

    def parse_item(self, response):
        self.log("parse_item")
        sel = Selector(response)
        sites = sel.xpath("//div[@id='llista-resultats']/div")
        items = []
        cont = 0
        for site in sites:
            item = BcnItem()
            item['id'] = cont
            item['title'] = u''.join(site.xpath('h3/a/text()').extract())
            item['url'] = u''.join(site.xpath('h3/a/@href').extract())
            item['when'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[1]/text()').extract())
            item['where'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[2]/span/a/text()').extract())
            item['street'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[3]/span/text()').extract())
            item['phone'] = u''.join(site.xpath('div[@class="dades"]/dl/dd[4]/text()').extract())
            items.append(item)
            cont = cont + 1
        return items
EDIT: After searching the internet I found a piece of code with which I can do that.
First of all, I have to get all the links, then I have to call another parse method.
def parse(self, response):
    # Get all URLs
    yield Request(url=_url, callback=self.parse_details)

def parse_details(self, response):
    # Detailed information of each page
If you want to use Rules because the page has a paginator, you should change def parse to def parse_start_url and then call this method through the Rule. With these changes you make sure that the parser begins at parse_start_url, and the code would be something like this:
rules = (
    Rule(
        SgmlLinkExtractor(
            allow=(re.escape("index.php")),
            restrict_xpaths=("//div[@class='paginador']")),
        callback="parse_start_url",
        follow=True),
)

def parse_start_url(self, response):
    # Get all URLs
    yield Request(url=_url, callback=self.parse_details)

def parse_details(self, response):
    # Detailed information of each page
That's all, folks.
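For reference, here is a concrete sketch of that parse_start_url / parse_details pattern, filled in with the selectors from the spider above; the modern LinkExtractor stands in for the old SgmlLinkExtractor, and the detail-page fields are placeholders:

import re

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class BcnDetailSpider(CrawlSpider):
    name = 'bcn_details'  # hypothetical name
    allowed_domains = ['guia.bcn.cat']
    start_urls = ['http://guia.bcn.cat/index.php?pg=search&q=*:*']

    rules = (
        Rule(
            LinkExtractor(
                allow=(re.escape("index.php"),),
                restrict_xpaths=("//div[@class='paginador']",)),
            callback="parse_start_url",
            follow=True),
    )

    def parse_start_url(self, response):
        # Collect every result's detail link and follow it.
        for href in response.xpath("//div[@id='llista-resultats']/div/h3/a/@href").getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_details)

    def parse_details(self, response):
        # Extract whatever detail fields are needed on the item page
        # (the title XPath here is a placeholder).
        yield {
            'url': response.url,
            'title': response.xpath('//h1/text()').get(),
        }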
There is an easier way of achieving this. Click "Next" on your link, and read the new URL carefully:
http://guia.bcn.cat/index.php?pg=search&from=10&q=*:*&nr=10
By looking at the GET data in the URL (everything after the question mark), and with a bit of testing, we find that these parameters mean:
from=10 - Starting index
q=*:* - Search query
nr=10 - Number of items to display
This is how I would have done it:
Set nr=100 or higher (1000 may work as well; just make sure there is no timeout).
Loop from from=0 to 34300. This is above the current number of entries. You may want to extract this value first.
Example code:
entries = 34246
step = 100
stop = entries - entries % step + step

for x in xrange(0, stop, step):
    url = 'http://guia.bcn.cat/index.php?pg=search&from={}&q=*:*&nr={}'.format(x, step)
    # Loop over all entries, and open links if needed
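As a sketch of how that loop could drive a spider (the class name is made up and the entry count is the one quoted above), the offsets can simply be generated in start_requests:

import scrapy


class BcnAllResultsSpider(scrapy.Spider):
    name = 'bcn_all'  # hypothetical name
    entries = 34246   # total entry count; ideally extracted from the first page
    step = 100

    def start_requests(self):
        stop = self.entries - self.entries % self.step + self.step
        for offset in range(0, stop, self.step):
            url = ('http://guia.bcn.cat/index.php?pg=search'
                   '&from={}&q=*:*&nr={}'.format(offset, self.step))
            yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        # Each response now holds up to `step` results; extract the fields
        # here or follow each detail link as in the earlier sketch.
        pass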