class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        for sel in response.xpath('//div[@id="site-list-content"]/div[@class="site-item "]'):
            #for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('div[@class="title-and-desc"]//div[@class="site-title"]/text()').extract()
            item['link'] = sel.xpath('div[@class="title-and-desc"]//a/@href').extract()
            item['desc'] = sel.xpath('normalize-space(div[@class="title-and-desc"]//div[@class="site-descr "]/text())').extract()
            yield item
That is the code for my scraper. I followed the tutorial on the Scrapy website, but it was a little outdated, so I had to adapt the code myself. It works with the /Books page, but not with the /Resources one. Can anyone explain why this happens? Thank you.
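One way to diagnose a mismatch like this is to test the selector against each page in scrapy shell; if it returns an empty list on the Resources page, that page's markup simply differs and the id/class names in the XPath (carried over from the Books page, so an assumption here) need adjusting:

scrapy shell "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
>>> response.xpath('//div[@id="site-list-content"]/div[@class="site-item "]')
>>> # an empty SelectorList here means the outer XPath matches nothing on this page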
I want to extract the title and the PDF link of each paper on this page: https://iclr.cc/Conferences/2019/Schedule?type=Poster
My code is here:
class ICLRCrawler(Spider):
    name = "ICLRCrawler"
    allowed_domains = ["iclr.cc"]
    start_urls = ["https://iclr.cc/Conferences/2019/Schedule?type=Poster", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]')
        titles = Selector(response).xpath('//*[@id="maincard_704"]/div[3]')
        links = Selector(response).xpath('//*[@id="maincard_704"]/div[6]/a[2]')
        for title, link in zip(titles, links):
            item = PapercrawlerItem()
            item['title'] = title.xpath('text()').extract()[0]
            item['pdf'] = link.xpath('/@href').extract()[0]
            item['sup'] = ''
            yield item
However, it seems that getting the title and link of each paper is not straightforward. How can I change the code to get the data?
You can use a much simpler approach:
def parse(self, response):
    for poster in response.xpath('//div[starts-with(@id, "maincard_")]'):
        item = PapercrawlerItem()
        item["title"] = poster.xpath('.//div[@class="maincardBody"]/text()[1]').get()
        item["pdf"] = poster.xpath('.//a[@title="PDF"]/@href').get()
        yield item
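As a side note, .get() returns None when nothing matches, while extract()[0] raises an IndexError on an empty selection, so the version above degrades gracefully for posters that have no PDF link:

poster.xpath('.//a[@title="PDF"]/@href').get()         # None if the link is missing
poster.xpath('.//a[@title="PDF"]/@href').extract()[0]  # IndexError if the link is missing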
You have to replace extract()[0] with get_attribute('href').
I'm using DOWNLOADER_MIDDLEWARES to rotate proxies with a scrapy.Spider, and I would like to record the proxy used for each request in an item field, i.e. item['proxy_used'].
I guess it might be possible to get the proxy via the Stats Collector, but I'm new to Python and Scrapy, and so far I haven't been able to find a solution.
import scrapy
from tutorial.items import QuotesItem

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    def parse_quotes(self, response):
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            item['proxy_used'] = ???  # <-- PROXY USED BY REQUEST - "HOW TO???"
            yield item

        # follow pagination links #shortcut
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)
You can use the response object to access the proxy used, like below:
response.meta.get("proxy")
Updated in your code too:
import scrapy
from tutorial.items import QuotesItem

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ["quotes.toscrape.com"]
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    def parse_quotes(self, response):
        for sel in response.css('div.quote'):
            item = QuotesItem()
            item['text'] = sel.css('span.text::text').get()
            item['author'] = sel.css('small.author::text').get()
            item['tags'] = sel.css('div.tags a.tag::text').getall()
            item['quotelink'] = sel.css('small.author ~ a[href*="goodreads.com"]::attr(href)').get()
            item['proxy_used'] = response.meta.get("proxy")
            yield item

        # follow pagination links #shortcut
        for a in response.css('li.next a'):
            yield response.follow(a, callback=self.parse_quotes)
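For context, this works because the downloader honours request.meta["proxy"] and response.meta is a shortcut to the originating request's meta. A minimal sketch of a rotating-proxy middleware that sets it (the middleware name and proxy list are placeholders, not your actual middleware):

import random

PROXIES = [
    "http://10.0.0.1:8080",  # placeholder proxy addresses
    "http://10.0.0.2:8080",
]

class RandomProxyMiddleware:
    def process_request(self, request, spider):
        # whatever is set here is what response.meta.get("proxy") returns later
        request.meta["proxy"] = random.choice(PROXIES)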
Hi, I would like to scrape two different domains in my script. I have tried an if statement, but it doesn't seem to work. Any ideas, please?
Here's my code:
class SalesitemSpiderSpider(scrapy.Spider):
    name = 'salesitem_spider'
    allowed_domains = ['www2.hm.com']
    start_urls = [
        'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999',
        'https://www.forever21.com/us/shop/catalog/category/f21/sale',
    ]

    def parse_start_url(response):
        if (response.url == 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'):
            parse_1(response)
        if (response.url == 'https://www.forever21.com/us/shop/catalog/category/f21/sale'):
            parse_2(response)

    def parse_1(self, response):
        for product_item in response.css('li.product-item'):
            item = {
                'title': product_item.css('h3.item-heading a.link::text').extract_first(),
                'regular-price': product_item.css('strong.item-price span.price.regular::text').extract_first(),
                'sale-price': product_item.css('strong.item-price span.price.sale::text').extract_first(),
                'photo-url': product_item.css('.image-container img::attr(data-src)').extract_first(),
                'description-url': "https://www2.hm.com/" + product_item.css('h3.item-heading a::attr(href)').extract_first(),
            }
            yield item

    def parse_2(self, response):
        #Some code getting item on domain 2
Please help, thank you.
Check your allowed_domains variable. You should add the new domain, like ['www2.hm.com', 'forever21.com'], or remove the restriction entirely. Also, you have no parse function.
I suggest removing your start_urls with the if statements and using start_requests instead. Your code will be more readable.
import scrapy

class SalesitemSpiderSpider(scrapy.Spider):
    name = 'salesitem_spider'
    allowed_domains = ['www2.hm.com', 'forever21.com']

    def start_requests(self):
        urls = (
            (self.parse_1, 'https://www2.hm.com/en_us/sale/shopbyproductladies/view-all.html?sort=stock&image-size=small&image=stillLife&offset=0&page-size=9999'),
            (self.parse_2, 'https://www.forever21.com/us/shop/catalog/category/f21/sale'),
        )
        for cb, url in urls:
            yield scrapy.Request(url, callback=cb)

    def parse_1(self, response):
        print(111111111)

    def parse_2(self, response):
        print(2222222222)
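Alternatively, if you want to keep a single start_urls list, you can dispatch from one parse callback by checking response.url (a sketch; yield from requires Python 3):

def parse(self, response):
    # route each response to the matching parser based on its domain
    if "hm.com" in response.url:
        yield from self.parse_1(response)
    elif "forever21.com" in response.url:
        yield from self.parse_2(response)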
I am new to Scrapy and Python. I was trying to retrieve data from https://in.bookmyshow.com/movies, since I need information on all the movies listed there. But there is something wrong with my code, and I would like to know where I have gone wrong.
rules = (
    Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )),
         callback="parse_items", follow=True),
)

def parse_items(self, response):
    for sel in response.xpath('//div[contains(@class, "movie-card")]'):
        item = Ex1Item()
        item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
        item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
        item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
        item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
        item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
        yield item
Your code seems to be fine. Perhaps the problem is outside the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class BookmyshowSpider(CrawlSpider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy

class BookmyshowSpider(scrapy.Spider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() extracts all links to movie pages from the start page. parse_movie() is used as the callback for all requests to the specific movie pages. With this version you certainly have more control over the spider's behavior.
I'm using the following code, which I found online, to recursively scrape links across multiple pages. It's supposed to return all the links I need from every page, but I only ever get 100 links at most. Any advice would be helpful.
class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://seattle.craigslist.org/search/jjj?is_parttime=1"]
    rules = (
        Rule(SgmlLinkExtractor(allow=("index\d00\.html", ), restrict_xpaths=('//a[@class="button next"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"]')
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
Just eliminate allow=("index\d00\.html", ) so the extractor follows every "next" link instead of only those matching that pattern:
rules = (
    Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@class="button next"]',)),
         callback="parse_items", follow=True),
)