Scrapy, Scraper not going through all links - python

class DmozSpider(scrapy.Spider):
    """Spider that scrapes site listings from two DMOZ Python category pages.

    Every listing block found inside the ``site-list-content`` container is
    yielded as a ``DmozItem`` with ``title``, ``link`` and ``desc`` fields.
    """

    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
    ]

    def parse(self, response):
        """Yield one ``DmozItem`` per ``site-item`` block in ``response``."""
        # XPath addresses attributes with "@". The class values are kept
        # verbatim (including their trailing spaces) because @class="..."
        # is an exact string comparison.
        listings = response.xpath(
            '//div[@id="site-list-content"]/div[@class="site-item "]'
        )
        # for sel in response.xpath('//ul/li'):
        for sel in listings:
            item = DmozItem()
            item['title'] = sel.xpath(
                'div[@class="title-and-desc"]//div[@class="site-title"]/text()'
            ).extract()
            item['link'] = sel.xpath(
                'div[@class="title-and-desc"]//a/@href'
            ).extract()
            item['desc'] = sel.xpath(
                'normalize-space(div[@class="title-and-desc"]'
                '//div[@class="site-descr "]/text())'
            ).extract()
            yield item
That is the code for my scraper. I followed the tutorial on the scrapy website, but the code was a little bit outdated so I had to change the code myself. The code works with the /python/books site, but not with the /resources one. Can anyone provide an explanation as to why this happens? Thank you.

Related

Scrapy, crawl data by onclick

I want to extract the title and the pdf link of each paper in this link: https://iclr.cc/Conferences/2019/Schedule?type=Poster
My code is here
class ICLRCrawler(Spider):
    """Spider for the ICLR 2019 poster schedule page.

    Yields ``PapercrawlerItem`` objects with ``title``, ``pdf`` and ``sup``
    fields.
    """

    name = "ICLRCrawler"
    allowed_domains = ["iclr.cc"]
    start_urls = ["https://iclr.cc/Conferences/2019/Schedule?type=Poster"]

    def parse(self, response):
        """Pair title nodes with PDF link nodes and yield one item per pair."""
        selector = Selector(response)
        # NOTE: both expressions are anchored to the element whose id is
        # "maincard_704", so only nodes inside that one card are matched.
        titles = selector.xpath('//*[@id="maincard_704"]/div[3]')
        links = selector.xpath('//*[@id="maincard_704"]/div[6]/a[2]')
        for title, link in zip(titles, links):
            item = PapercrawlerItem()
            # extract_first() returns None instead of raising IndexError
            # when the node has no matching text/attribute.
            item['title'] = title.xpath('text()').extract_first()
            # "@href" is relative to the <a> node; a leading "/" would
            # restart the path at the document root and match nothing.
            item['pdf'] = link.xpath('@href').extract_first()
            item['sup'] = ''
            yield item
However, it seems that it is not easy to get the title and link of each paper. Here, how can I change the code to get the data?
You can use a much simpler approach:
def parse(self, response):
    """Yield one ``PapercrawlerItem`` for every poster card in ``response``.

    A poster card is any ``div`` whose ``id`` starts with ``maincard_``.
    """
    for poster in response.xpath('//div[starts-with(@id, "maincard_")]'):
        item = PapercrawlerItem()
        # First text node of the card body.
        item["title"] = poster.xpath('.//div[@class="maincardBody"]/text()[1]').get()
        # href of the anchor titled "PDF"; .get() gives None when absent.
        item["pdf"] = poster.xpath('.//a[@title="PDF"]/@href').get()
        yield item
you have to replace Extract()[0] with get_attribute('href')

How to get the proxy used for each request in an item with Scrapy?

I'm using DOWNLOADER_MIDDLEWARES to rotate proxies with a scrapy.Spider, and I would like each item to carry a field, e.g. item['proxy_used'], holding the proxy that was used for its request.
I guess it might be possible to get the proxy from the "Stats Collector", but I'm new to Python and Scrapy and so far I haven't been able to come up with a solution.
import scrapy
from tutorial.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes from quotes.toscrape.com, following pagination."""

    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    def parse(self, response):
        """Default callback for ``start_urls``; delegates to ``parse_quotes``."""
        return self.parse_quotes(response)

    def parse_quotes(self, response):
        """Yield a ``QuotesItem`` per quote, then a request for the next page."""
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            # Placeholder kept syntactically valid:
            # ??? <-- PROXY USED BY REQUEST - "HOW TO???"
            item['proxy_used'] = None
            yield item
        # follow pagination links #shortcut
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)
You can use the response object to access the proxy that was used, like this:
response.meta.get("proxy")
Updated in your code too.
import scrapy
from tutorial.items import QuotesItem
class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes and records the proxy used for each response."""

    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    def parse(self, response):
        """Default callback for ``start_urls``; delegates to ``parse_quotes``."""
        return self.parse_quotes(response)

    def parse_quotes(self, response):
        """Yield a ``QuotesItem`` per quote, then a request for the next page."""
        # The same response (and therefore the same meta) serves every
        # quote on the page, so look the proxy up once.
        proxy_used = response.meta.get("proxy")
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            # None when no "proxy" key was set on the request's meta.
            item['proxy_used'] = proxy_used
            yield item
        # follow pagination links #shortcut
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)

How to scrape on two different domain using scrapy?

Hi, I would like to scrape 2 different domains in my script. I have tried using an if statement, but it seems that it is not working. Any ideas, please?
Here's my code
class SalesitemSpiderSpider(scrapy.Spider):
    """Spider that scrapes sale listings from two shops, one parser per shop."""

    name = 'salesitem_spider'

    _HM_SALE_URL = 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'
    _F21_SALE_URL = 'https://www.forever21.com/us/shop/catalog/category/f21/sale'

    # Both shops are listed so that requests to either one are in scope.
    allowed_domains = ['www2.hm.com', 'www.forever21.com']
    start_urls = [
        _HM_SALE_URL,
        _F21_SALE_URL,
    ]

    def parse(self, response):
        """Default callback for ``start_urls``; routes to the per-shop parser."""
        return self.parse_start_url(response)

    def parse_start_url(self, response):
        """Return the items produced by the parser matching ``response.url``.

        Responses whose URL is neither start URL produce no items.
        """
        if response.url == self._HM_SALE_URL:
            return self.parse_1(response)
        if response.url == self._F21_SALE_URL:
            return self.parse_2(response)
        return []

    def parse_1(self, response):
        """Yield one dict per ``li.product-item`` element in ``response``."""
        for product_item in response.css('li.product-item'):
            href = product_item.css('h3.item-heading a::attr(href)').extract_first()
            item = {
                'title': product_item.css('h3.item-heading a.link::text').extract_first(),
                'regular-price': product_item.css('strong.item-price span.price.regular::text').extract_first(),
                'sale-price': product_item.css('strong.item-price span.price.sale::text').extract_first(),
                'photo-url': product_item.css('.image-container img::attr(data-src)').extract_first(),
                # urljoin avoids a doubled "/" and a missing href no longer
                # raises TypeError from str + None.
                'description-url': response.urljoin(href) if href else None,
            }
            yield item

    def parse_2(self, response):
        """Parser for the second domain (not implemented yet)."""
        # Some code getting item on domain 2
        return []
Please Help thank you
Check your allowed_domains variable. You should add the new domain, like ['www2.hm.com', 'forever21.com'], or remove the variable altogether. Also, you have no parse function.
I would suggest replacing your start_urls and the if checks with start_requests instead. Your code will be more readable.
import scrapy
class SalesitemSpiderSpider(scrapy.Spider):
    """Spider that issues one request per shop, each with its own callback."""

    name = 'salesitem_spider'
    allowed_domains = ['www2.hm.com', 'forever21.com']

    def start_requests(self):
        """Yield a request for every (callback, url) pair."""
        urls = (
            (self.parse_1, 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'),
            (self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
        )
        for cb, url in urls:
            yield scrapy.Request(url, callback=cb)

    def parse_1(self, response):
        """Placeholder callback for the first URL."""
        # print() with a single argument behaves the same on Python 2 and 3.
        print(111111111)

    def parse_2(self, response):
        """Placeholder callback for the second URL."""
        print(2222222222)

Getting data from multiple links using scrapy

I am new to Scrapy and Python. I was trying to retrieve the data from https://in.bookmyshow.com/movies, since I need the information for all the movies. But there is something wrong with my code, and I would like to know where I have gone wrong.
# Crawl rule: links matching the movies URL pattern are followed and handed
# to parse_items. The pattern is a raw string so "\." reaches the regex
# engine unchanged.
rules = (
    Rule(
        SgmlLinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)),
        callback="parse_items",
        follow=True,
    ),
)

def parse_items(self, response):
    """Yield one ``Ex1Item`` per element whose class contains ``movie-card``."""
    for sel in response.xpath('//div[contains(@class, "movie-card")]'):
        item = Ex1Item()
        item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
        # NOTE: this path starts at the document root ("/html/..."), so it
        # is not evaluated relative to the current card.
        item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
        item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
        # NOTE: absolute path as well; see the note on 'Language'.
        item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
        item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
        yield item
Your code seems to be fine. Perhaps the problem is outside of the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class BookmyshowSpider(CrawlSpider):
    """CrawlSpider that follows movie links and scrapes movie cards."""

    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    # Raw string so the "\." escapes reach the regex engine unchanged.
    rules = (
        Rule(
            SgmlLinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Yield one ``Ex1Item`` per element whose class contains ``movie-card``."""
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            # NOTE: this path starts at the document root ("/html/..."), so
            # it is not evaluated relative to the current card.
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            # NOTE: absolute path as well; see the note on 'Language'.
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy
class BookmyshowSpider(scrapy.Spider):
    """Spider that collects movie links from the start page and scrapes each."""

    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        """Yield a request to ``parse_movie`` for every distinct movie link."""
        # Raw string: "\/" is not a valid Python string escape, so the
        # pattern is passed to the regex engine exactly as written.
        links = response.xpath('//a/@href').re(r'movies/[^\/]+\/.*$')
        # set() drops duplicate hrefs so each movie page is requested once.
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        """Yield one dict per element whose class contains ``movie-card``."""
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            # NOTE: this path starts at the document root ("/html/..."), so
            # it is not evaluated relative to the current card.
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            # NOTE: absolute path as well; see the note on 'Language'.
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() parses all links to movie pages from the start page. parse_movie() is used as a callback for all Requests to the specific movie pages. With this version you certainly have more control over the spider behavior.

using scrapy get links recursively on multipage

I'm using following code I found online to recursively scrape links on multi pages. It's supposed to return me all the links I need on all pages recursively. But I ended up with only getting 100 links at maximum. Any advice will be helpful.
class MySpider(CrawlSpider):
    """CrawlSpider that collects posting titles and links from Craigslist."""

    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://seattle.craigslist.org/search/jjj?is_parttime=1"]

    # Links are taken from the "button next" anchor and must also match the
    # allow pattern (raw string so "\d" and "\." reach the regex unchanged).
    rules = (
        Rule(
            SgmlLinkExtractor(
                allow=(r"index\d00\.html",),
                restrict_xpaths=('//a[@class="button next"]',),
            ),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Return a list with one ``CraigslistSampleItem`` per ``span.pl`` node."""
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"]')
        items = []
        # A distinct loop variable avoids rebinding ``titles`` mid-iteration.
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
Just eliminate allow=("index\d00\.html", ) to let it parse the next link:
# Follow only the link found in the "button next" anchor; no allow pattern.
rules = (
    Rule(
        SgmlLinkExtractor(restrict_xpaths=('//a[@class="button next"]',)),
        callback="parse_items",
        follow=True,
    ),
)

Categories