scrapy transport start_url to subsequent requests - python

For three days I have been trying to save the respective start_url in a meta attribute so I can pass it along to subsequent requests in Scrapy; that way I can use the start_url as a key into a dictionary and populate my output with additional data. It should be straightforward, because it is explained in the documentation ...
There is a discussion in the Google Scrapy group and there was a question here as well, but I can't get it to run :(
I am new to Scrapy and I think it is a great framework, but for my project I need to know the start_url of every request, and that turns out to be more complicated than it looks.
I would really appreciate some help!
At the moment my code looks like this:
class example(CrawlSpider):

    name = 'example'
    start_urls = ['http://www.example.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('/blablabla/', )), callback='parse_item'),
    )

    def parse(self, response):
        for request_or_item in super(example, self).parse(response):
            if isinstance(request_or_item, Request):
                request_or_item = request_or_item.replace(meta={'start_url': response.meta['start_url']})
            yield request_or_item

    def make_requests_from_url(self, url):
        return Request(url, dont_filter=True, meta={'start_url': url})

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        item = testItem()
        print response.request.meta, response.url
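For reference, the documented pattern of carrying extra data to callbacks through Request.meta looks roughly like this in a plain Spider. This is a minimal sketch, not the OP's CrawlSpider; the spider name, the example.com URL and the parse_item callback are placeholders:

import scrapy

class MetaExampleSpider(scrapy.Spider):
    # minimal sketch: carry the originating start URL along in request.meta
    name = 'meta_example'
    start_urls = ['http://www.example.com']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, meta={'start_url': url})

    def parse(self, response):
        # every request yielded here inherits the start_url of the page it came from
        for href in response.xpath('//a/@href').getall():
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_item,
                meta={'start_url': response.meta['start_url']})

    def parse_item(self, response):
        yield {'url': response.url, 'start_url': response.meta['start_url']}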

I wanted to delete this answer since it doesn't solve the OP's problem, but I thought I'd leave it as a Scrapy example.
Warning:
When writing crawl spider rules, avoid using parse as callback, since
the CrawlSpider uses the parse method itself to implement its logic.
So if you override the parse method, the crawl spider will no longer
work.
Use BaseSpider instead:
from datetime import datetime
import urlparse

from scrapy.spider import BaseSpider  # pre-1.0 import path, matching the original code

# `settings` and `items` are project-specific modules here: `settings.db` is an
# open database connection and `items.Item` is the project's item class

class Spider(BaseSpider):
    name = "domain_spider"

    def start_requests(self):
        last_domain_id = 0
        chunk_size = 10
        cursor = settings.db.cursor()

        while True:
            cursor.execute("""
                    SELECT domain_id, domain_url
                    FROM domains
                    WHERE domain_id > %s AND scraping_started IS NULL
                    LIMIT %s
                """, (last_domain_id, chunk_size))
            self.log('Requesting %s domains after %s' % (chunk_size, last_domain_id))
            rows = cursor.fetchall()
            if not rows:
                self.log('No more domains to scrape.')
                break

            for domain_id, domain_url in rows:
                last_domain_id = domain_id
                request = self.make_requests_from_url(domain_url)
                item = items.Item()
                item['start_url'] = domain_url
                item['domain_id'] = domain_id
                item['domain'] = urlparse.urlparse(domain_url).hostname
                request.meta['item'] = item

                cursor.execute("""
                        UPDATE domains
                        SET scraping_started = %s
                        WHERE domain_id = %s
                    """, (datetime.now(), domain_id))

                yield request
    ...
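The snippet stops at attaching the item to request.meta. A hypothetical sketch of the callback side (not part of the original answer, and assuming the default parse() callback is used) would pick the partially filled item back up, complete it, and yield it:

    def parse(self, response):
        item = response.meta['item']
        # ... fill the remaining fields from the response here ...
        yield item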

Related

How to execute a method only after each http request succeeds in scrapy spider?

I have the following spider:
import scrapy
from twisted.internet import defer


class TestSpider(scrapy.Spider):
    name = 'test'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['https://quotes.toscrape.com/', 'https://quotes.toscrape.com/page/2/', 'https://quotes.toscrape.com/page/3/']

    def parse(self, response):
        # deferred = defer.Deferred()
        # deferred.addCallback(self.parse_author)
        # deferred.addCallback(self.process_engine)
        self.logger.info("Parsing quote page!!")
        data = {}
        quotes = response.xpath('//div[@class="quote"]')
        for i, quote in enumerate(quotes):
            data['quote'] = quote.xpath('./span[@class="text"]/text()').get()
            data['tags'] = quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').getall()
            data['author_url'] = response.urljoin(quote.xpath('.//a[starts-with(@href, "/author/")]/@href').get())
            if data['author_url'] and i < 5:
                yield scrapy.Request(data['author_url'], callback=self.parse_author, meta={'data': data})
                # yield scrapy.Request(data['author_url'], callback=deferred.callback)
            else:
                self.logger.info("Author data not found!!")
                yield data
        if quotes:
            self.process_engine()

    def parse_author(self, response):
        self.logger.info("Parsing author page!!")
        data = response.meta.get('data')
        data['author_name'] = response.xpath('//h3/text()').get()
        data['author_dob'] = response.xpath('//span[@class="author-born-date"]/text()').get()
        yield data

    def process_engine(self, data=None):
        self.logger.info("\n<<<<<<<<<<<<<<<<<<< process engine running >>>>>>>>>>>>>>>>>>>>>>>\n")
        # some logic here
What I want is to execute the process_engine method only after all the requests for a start_urls page have finished successfully (i.e. there are quotes, which may then be stored in a database). But what happens is that process_engine gets executed while the spider is still waiting for the author_url responses.
I tried using defer too, but it didn't work, or I didn't do it the right way. How can we resolve this problem? Any help would be highly appreciated.
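One approach (not from this thread) is to count the outstanding author requests spawned by each listing page and call process_engine only when the last one comes back. A rough sketch, assuming cb_kwargs (Scrapy 1.7+) and no request failures (no errback handling):

import scrapy

class CountedTestSpider(scrapy.Spider):
    # hypothetical sketch: run process_engine only after every author request
    # spawned by a listing page has returned
    name = 'test_counted'
    start_urls = ['https://quotes.toscrape.com/',
                  'https://quotes.toscrape.com/page/2/']

    def parse(self, response):
        pending = {'count': 0}          # one counter per listing page
        for quote in response.xpath('//div[@class="quote"]'):
            data = {
                'quote': quote.xpath('./span[@class="text"]/text()').get(),
                'author_url': response.urljoin(
                    quote.xpath('.//a[starts-with(@href, "/author/")]/@href').get()),
            }
            pending['count'] += 1
            # dont_filter: several quotes share an author, and a deduplicated
            # request would never decrement the counter
            yield scrapy.Request(data['author_url'], callback=self.parse_author,
                                 cb_kwargs={'data': data, 'pending': pending},
                                 dont_filter=True)

    def parse_author(self, response, data, pending):
        data['author_name'] = response.xpath('//h3/text()').get()
        yield data
        pending['count'] -= 1
        if pending['count'] == 0:       # last author page for this listing page
            self.process_engine()

    def process_engine(self, data=None):
        self.logger.info("process engine running")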

how to make an additional request and get data from it

I need to parse data from a site and save it to disk after parsing. I am using Scrapy. While parsing a page, I need to get data from another page as well. How can I do that?
import os

import scrapy
from scrapy import Request, Selector

# MyItem is the project's item class (defined in items.py)

class MySpider(scrapy.Spider):
    name = "my_spyder"

    start_urls = [
        'https://www.example.com/title/1',
        'https://www.example.com/title/2',
        'https://www.example.com/title/3',
    ]

    def parse(self, response):
        item = MyItem()
        main_page_selector = Selector(response)
        ...
        tagline_url = os.path.join(response.url, 'taglines')
        request = Request(url=tagline_url, callback=self.get_tags)
        item['tags'] = yield request
        ...
        yield item

    def get_tags(self, response):
        tagline_selector = Selector(response)
        taglines = []
        for tag in tagline_selector.xpath('//div[@class="soda even"]/text()').getall():
            taglines.append(tag.strip())
        return taglines
How do I write the taglines collected in get_tags into the item's 'tags' field?
These requests are executed asynchronously, so you have to carry the item along with the request:
request = Request(url=tagline_url, callback=self.get_tags)
request.meta["item"] = item
yield request
The code above goes in the parse method.
item = response.meta["item"]
#...
item["tags"] = taglines
yield item
The second snippet goes in the get_tags method.
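Putting the two snippets together, a consolidated sketch of the whole spider might look roughly like this (the tagline URL construction and the dict-based item are assumptions, not part of the original answer):

import scrapy

class MySpider(scrapy.Spider):
    # hypothetical consolidated version of the two snippets above
    name = "my_spider"
    start_urls = ['https://www.example.com/title/1']

    def parse(self, response):
        item = {}                                   # or MyItem()
        # ... fill the fields available on this page ...
        tagline_url = response.url.rstrip('/') + '/taglines'   # assumed URL layout
        request = scrapy.Request(tagline_url, callback=self.get_tags)
        request.meta['item'] = item                 # carry the half-built item along
        yield request

    def get_tags(self, response):
        item = response.meta['item']
        item['tags'] = [t.strip() for t in
                        response.xpath('//div[@class="soda even"]/text()').getall()]
        yield item                                  # the item is only complete here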

IMDB scrapy get all movie data

I am working on a class project and trying to get all IMDB movie data (titles, budgets, etc.) up until 2016. I adapted the code from https://github.com/alexwhb/IMDB-spider/blob/master/tutorial/spiders/spider.py.
My thought is: for each year in range(1874, 2016) (since 1874 is the earliest year shown on http://www.imdb.com/year/), direct the program to that year's page and grab the data from that URL.
But the problem is that each year's page only shows 50 movies, so after crawling those 50 movies, how can I move on to the next page? And after crawling a whole year, how can I move on to the next year? This is my code for the URL-parsing part so far, but it is only able to crawl 50 movies for a particular year.
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]
    start_urls = ["http://www.imdb.com/search/title?year=2014,2014&title_type=feature&sort=moviemeter,asc"]

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
You can use CrawlSpider to simplify your task. As you'll see below, start_requests dynamically generates the list of URLs, while parse_page only extracts the movies to crawl. Finding and following the 'Next' link is done by the rules attribute.
I agree with @Padraic Cunningham that hard-coding values is not a great idea. I've added spider arguments so that you can call:
scrapy crawl imdb -a start=1950 -a end=1980 (the scraper will default to 1874-2016 if it doesn't get any arguments).
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from imdbyear.items import MovieItem

class IMDBSpider(CrawlSpider):
    name = 'imdb'
    rules = (
        # extract links at the bottom of the page. note that there are 'Prev' and 'Next'
        # links, so a bit of additional filtering is needed
        Rule(LinkExtractor(restrict_xpaths=('//*[@id="right"]/span/a')),
             process_links=lambda links: filter(lambda l: 'Next' in l.text, links),
             callback='parse_page',
             follow=True),
    )

    def __init__(self, start=None, end=None, *args, **kwargs):
        super(IMDBSpider, self).__init__(*args, **kwargs)
        self.start_year = int(start) if start else 1874
        self.end_year = int(end) if end else 2016

    # generate start_urls dynamically
    def start_requests(self):
        for year in range(self.start_year, self.end_year + 1):
            yield scrapy.Request('http://www.imdb.com/search/title?year=%d,%d&title_type=feature&sort=moviemeter,asc' % (year, year))

    def parse_page(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            # note -- you had 'MianPageUrl' as your scrapy field name. I would recommend fixing this typo
            # (you will need to change it in items.py as well)
            item['MainPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request

    # make sure that the dynamically generated start_urls are parsed as well
    parse_start_url = parse_page

    # do your magic
    def parseMovieDetails(self, response):
        pass
You can use the piece of code below to follow the next page:
# 'a.lister-page-next.next-page::attr(href)' is the selector to get the next page link
next_page = response.css('a.lister-page-next.next-page::attr(href)').extract_first()
if next_page is not None:
    next_page = response.urljoin(next_page)  # joins the current and next page urls
    yield scrapy.Request(next_page, callback=self.parse)  # calls parse again when crawling the next page
I figured out a very dumb way to solve this: I put all the links in start_urls. A better solution would be very much appreciated!
class tutorialSpider(scrapy.Spider):
    name = "tutorial"
    allowed_domains = ["imdb.com"]

    start_urls = []
    for i in xrange(1874, 2017):
        for j in xrange(1, 11501, 50):
            # since the largest number of movies for a year to have is 11,400 (2016)
            start_url = "http://www.imdb.com/search/title?sort=moviemeter,asc&start=" + str(j) + "&title_type=feature&year=" + str(i) + "," + str(i)
            start_urls.append(start_url)

    def parse(self, response):
        for sel in response.xpath("//*[@class='results']/tr/td[3]"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract()[0]
            item['MianPageUrl'] = "http://imdb.com" + sel.xpath('a/@href').extract()[0]
            request = scrapy.Request(item['MianPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request
The code that @Greg Sadetsky provided needs some minor changes. Well, only one change, in the first line of the parse_page method.
Just change the XPath in the for loop from:
response.xpath("//*[@class='results']/tr/td[3]"):
to
response.xpath("//*[contains(@class,'lister-item-content')]/h3"):
This worked like a charm for me!
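For illustration, parse_page with that updated XPath would look roughly like this. This is a sketch, not verified against the current IMDb markup, and it swaps the original extract()[0] indexing for extract_first and response.urljoin:

    def parse_page(self, response):
        # hypothetical: same loop body as in the answer above, with only the
        # outer XPath updated for the newer IMDb list markup
        for sel in response.xpath("//*[contains(@class,'lister-item-content')]/h3"):
            item = MovieItem()
            item['Title'] = sel.xpath('a/text()').extract_first()
            item['MainPageUrl'] = response.urljoin(sel.xpath('a/@href').extract_first())
            request = scrapy.Request(item['MainPageUrl'], callback=self.parseMovieDetails)
            request.meta['item'] = item
            yield request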

How to feed a spider with links crawled within the spider?

I'm writing a spider (CrawlSpider) for an online store. According to the client's requirements, I need to write two rules: one for determining which pages have items and another for extracting the items.
I have both rules already working independently:
If my start_urls = ["www.example.com/books.php", "www.example.com/movies.php"] and I comment out the Rule and the code of parse_category, my parse_item will extract every item.
On the other hand, if start_urls = "http://www.example.com" and I comment out the Rule and the code of parse_item, parse_category will return every link that has items to extract, i.e. parse_category will return www.example.com/books.php and www.example.com/movies.php.
My problem is that I don't know how to merge both modules, so that start_urls = "http://www.example.com", parse_category extracts www.example.com/books.php and www.example.com/movies.php, and those links are fed to parse_item, where I actually extract the info of each item.
I need to do it this way instead of just using start_urls = ["www.example.com/books.php", "www.example.com/movies.php"], because if a new category is added in the future (e.g. www.example.com/music.php), the spider wouldn't be able to detect that new category automatically and would have to be edited manually. It's not a big deal, but the client doesn't want that.
class StoreSpider(CrawlSpider):
    name = "storyder"

    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]
    #start_urls = ["http://www.example.com/books.php", "http://www.example.com/movies.php"]

    rules = (
        Rule(LinkExtractor(), follow=True, callback='parse_category'),
        Rule(LinkExtractor(), follow=False, callback="parse_item"),
    )

    def parse_category(self, response):
        category = StoreCategory()
        # some code for determining whether the current page is a category, or just other stuff
        if is_a_category:
            category['name'] = name
            category['url'] = response.url
        return category

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
The CrawlSpider rules don't work the way you want; you'll need to implement the logic yourself. When you specify follow=True you can't use a callback, because the idea is to keep following links (not extracting items) while the rules are applied; check the documentation.
You could try something like this:
class StoreSpider(CrawlSpider):
    name = "storyder"

    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    # no rules

    def parse(self, response):  # this is parse_category
        category_le = LinkExtractor("something for categories")
        for a in category_le.extract_links(response):
            yield Request(a.url, callback=self.parse_category)

        item_le = LinkExtractor("something for items")
        for a in item_le.extract_links(response):
            yield Request(a.url, callback=self.parse_item)

    def parse_category(self, response):
        category = StoreCategory()
        # some code for determining whether the current page is a category, or just other stuff
        if is_a_category:
            category['name'] = name
            category['url'] = response.url
        yield category

        for req in self.parse(response):
            yield req

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
Instead of using parse_category, I used restrict_css in the LinkExtractor to get the links I want, and it seems to be feeding the second Rule with the extracted links, so my question is answered. It ended up this way:
class StoreSpider(CrawlSpider):
    name = "storyder"

    allowed_domains = ["example.com"]
    start_urls = ["http://www.example.com/"]

    rules = (
        Rule(LinkExtractor(restrict_css=("#movies", "#books"))),
        Rule(LinkExtractor(), callback="parse_item"),
    )

    def parse_item(self, response):
        item = StoreItem()
        # some code for extracting the item's data
        return item
It still can't detect newly added categories (and there is no clear pattern to use in restrict_css without fetching other garbage), but at least it complies with the client's requirements: two rules, one for extracting category links and the other for extracting item data.

How to return items from my CrawlSpider?

I want to start scraping from one page and traverse hundreds of pages via the next-page URL, which I have handled in the following code. During that crawl I need to follow another link, extract data, and store it in the item. I can easily print all the item data to be exported, but I am not able to return it from the function as desired.
class UserLoginCrawl(CrawlSpider):
    name = "mylogin"
    allowed_domains = ['www.example.com']
    login_page = "www.example.com/user"
    start_urls = ["www.example.com/profile?page=0"]

    rules = [Rule(SgmlLinkExtractor(
        allow=('/profile\?page=\d+'),
        restrict_xpaths=('//li[@class="pager-next"]',), canonicalize=False),
        callback='parse_page',
        follow=True), ]

    # ulists = []

    def parse_page(self, response):
        self.log('XYZ, Started Crawling %s' % response.url)
        items = response.xpath("//div[@id='profile']/div")
        for temp in items:
            userurl = 'www.example.com' + temp.xpath("./div[@class='name']/a/@href").extract()[0]
            yield Request(url=userurl, callback=self.parse_profile_page)
        self.log('XYZ, Finished Crawling %s' % response.url)
        # return self.ulists

    def parse_profile_page(self, response):
        usritem = PostUsers()
        self.log('XYZ, Started Crawling user Profile %s' % response.url)
        usritem["userlink"] = response.url
        usritem["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
        relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
        usritem["postlink"] = 'www.example.com' + relative_url
        usritem["history"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
        # self.ulists.append(usritem)
        print usritem
        # return usritem
Use yield usritem at the end of your parse_profile_page method instead of printing it.
See the second example in the Spider Examples section of the Scrapy documentation.
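In other words, a sketch of the rewritten callback (same fields as above, hypothetical apart from replacing the print with a yield) would be:

    def parse_profile_page(self, response):
        # same method as above, yielding the item instead of printing it
        usritem = PostUsers()
        usritem["userlink"] = response.url
        usritem["fullname"] = response.xpath("//h1[@id='page-title']/text()").extract()
        relative_url = response.xpath("//div[@id='nav-content']/ul/li[2]/a/@href").extract()[0]
        usritem["postlink"] = 'www.example.com' + relative_url
        usritem["history"] = response.xpath("//div[@id='user_user_full_group_profile_main']/dl/dd[1]/text()").extract()
        yield usritem  # Scrapy collects yielded items and feeds them to pipelines/exporters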
