I'm using the following code, which I found online, to recursively scrape links across multiple pages. It's supposed to return all the links I need from every page, but I end up with at most 100 links. Any advice would be helpful.
class MySpider(CrawlSpider):
    name = "craigs"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://seattle.craigslist.org/search/jjj?is_parttime=1"]

    rules = (
        Rule(SgmlLinkExtractor(allow=("index\d00\.html", ),
                               restrict_xpaths=('//a[@class="button next"]',)),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//span[@class="pl"]')
        items = []
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
Just eliminate allow=("index\d00\.html", ) to let the spider follow the next-page link:
rules = (
    Rule(SgmlLinkExtractor(restrict_xpaths=('//a[@class="button next"]',)),
         callback="parse_items", follow=True),
)
I want to extract the title and the pdf link of each paper in this link: https://iclr.cc/Conferences/2019/Schedule?type=Poster
My code is here
class ICLRCrawler(Spider):
    name = "ICLRCrawler"
    allowed_domains = ["iclr.cc"]
    start_urls = ["https://iclr.cc/Conferences/2019/Schedule?type=Poster", ]

    def parse(self, response):
        papers = Selector(response).xpath('//*[@id="content"]/div/div[@class="paper"]')
        titles = Selector(response).xpath('//*[@id="maincard_704"]/div[3]')
        links = Selector(response).xpath('//*[@id="maincard_704"]/div[6]/a[2]')
        for title, link in zip(titles, links):
            item = PapercrawlerItem()
            item['title'] = title.xpath('text()').extract()[0]
            item['pdf'] = link.xpath('/@href').extract()[0]
            item['sup'] = ''
            yield item
However, it seems that it is not easy to get the title and link of each paper this way. How can I change the code to get the data?
You can use a much simpler approach:
def parse(self, response):
    for poster in response.xpath('//div[starts-with(@id, "maincard_")]'):
        item = PapercrawlerItem()
        item["title"] = poster.xpath('.//div[@class="maincardBody"]/text()[1]').get()
        item["pdf"] = poster.xpath('.//a[@title="PDF"]/@href').get()
        yield item
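A side note on the selector API, since the question used extract()[0]: .get() returns the first match as a string, or None when nothing matches, whereas .extract()[0] raises an IndexError on an empty result. A small illustration, reusing the XPath from the answer above:

# .extract() returns a list of all matching strings;
# indexing it with [0] fails when nothing matched.
titles = response.xpath('//div[@class="maincardBody"]/text()').extract()

# .get() returns the first match, or None if there is no match --
# safer for optional fields such as the PDF link.
first_title = response.xpath('//div[@class="maincardBody"]/text()').get()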
You have to replace extract()[0] with get_attribute('href').
class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    def parse(self, response):
        for sel in response.xpath('//div[@id="site-list-content"]/div[@class="site-item "]'):
        #for sel in response.xpath('//ul/li'):
            item = DmozItem()
            item['title'] = sel.xpath('div[@class="title-and-desc"]//div[@class="site-title"]/text()').extract()
            item['link'] = sel.xpath('div[@class="title-and-desc"]//a/@href').extract()
            item['desc'] = sel.xpath('normalize-space(div[@class="title-and-desc"]//div[@class="site-descr "]/text())').extract()
            yield item
That is the code for my scraper. I followed the tutorial on the Scrapy website, but it was a little outdated, so I had to adapt the code myself. It works with the /Python/Books page but not with the /Python/Resources one. Can anyone explain why this happens? Thank you.
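One way to narrow this down (a debugging sketch, reusing the selectors from the question) is to open each page in scrapy shell and check whether the container XPath matches anything; exact class strings such as "site-item " sometimes differ slightly between pages:

scrapy shell "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
>>> # An empty list here means the container selector does not match this page
>>> response.xpath('//div[@id="site-list-content"]/div[@class="site-item "]')
>>> # A contains() match is more forgiving than an exact class string
>>> response.xpath('//div[@id="site-list-content"]/div[contains(@class, "site-item")]')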
I want to create a Scrapy script to scrape all of the results for computer gigs in any craigslist subdomain:
for example here: http://losangeles.craigslist.org/search/cpg/
This query returns a list of many articles. I've tried to scrape the title and href of each of these results (not only the ones on the first page) using CrawlSpider and LinkExtractor, to no avail: the script returns nothing.
I'll paste my script below; thanks.
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor

class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = (
        'http://losangeles.craigslist.org/search/cpg/',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")

    for i in items:
        link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
        title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
        print link, title
According to the code you pasted, parse_page:
1. does not return/yield anything, and
2. only contains one line: "items = response.selector..."
The reason for #2 above is that the for loop is not properly indented.
Try indenting the for loop:
class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = ('http://losangeles.craigslist.org/search/cpg/',)
    rules = (Rule(
        LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")

        for i in items:
            link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
            title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
            print link, title
            yield dict(link=link, title=title)
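One more thing worth checking, though it is not what the answer above addresses: allowed_domains should contain bare domain names, not URLs with a scheme, otherwise the offsite middleware may filter every extracted link and the spider finishes without scraping anything:

# A scheme ("http://") inside allowed_domains can cause every request
# to be dropped as "offsite"; use the bare hostname instead.
allowed_domains = ["losangeles.craigslist.org"]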
I am new to Scrapy and Python. I was trying to retrieve data from https://in.bookmyshow.com/movies, since I need the information on all the movies. But there is something wrong with my code; I would like to know where I have gone wrong.
rules = (Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow=True),)

def parse_items(self, response):
    for sel in response.xpath('//div[contains(@class, "movie-card")]'):
        item = Ex1Item()
        item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
        item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
        item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
        item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
        item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
        yield item
Your code seems to be fine. Perhaps the problem is outside of the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class BookmyshowSpider(CrawlSpider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    rules = (Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow=True),)

    def parse_items(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy

class BookmyshowSpider(scrapy.Spider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() parses all links to movie pages from the start page. parse_movie() is used as a callback for all Requests to the specific movie pages. With this version you certainly have more control over the spider behavior.
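One caveat that is not part of the original answer: scrapy.contrib.linkextractors.sgml.SgmlLinkExtractor was deprecated and later removed from Scrapy, so on a current install the CrawlSpider version above needs the modern LinkExtractor instead. A minimal sketch of the replacement rule:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

# LinkExtractor is a drop-in replacement for SgmlLinkExtractor here;
# the allow pattern is unchanged from the answer above.
rules = (Rule(LinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)),
              callback="parse_items", follow=True),)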
I am scraping this site using Scrapy. However, I am having trouble with the XPath, and I'm not entirely sure what is going on.
Why does this work:
def parse_item(self, response):
    item = BotItem()
    for title in response.xpath('//h1'):
        item['title'] = title.xpath('strong/text()').extract()
        item['wage'] = title.xpath('span[@class="price"]/text()').extract()
        yield item
but the following code does not?
def parse_item(self, response):
    item = BotItem()
    for title in response.xpath('//body'):
        item['title'] = title.xpath('h1/strong/text()').extract()
        item['wage'] = title.xpath('h1/span[@class="price"]/text()').extract()
        yield item
I also aim to extract the following XPath:
//div[@id="description"]/p
But I can't, because it is outside the h1 node. How can I achieve this? My full code is:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bot.items import BotItem

class MufmufSpider(CrawlSpider):
    name = 'mufmuf'
    allowed_domains = ['mufmuf.ro']
    start_urls = ['http://mufmuf.ro/locuri-de-munca/joburi-in-strainatate/']

    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="paginate"][position() = last()]'),
            #callback='parse_start_url',
            follow=True
        ),
        Rule(
            LinkExtractor(restrict_xpaths='//h3/a'),
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        item = BotItem()
        for title in response.xpath('//h1'):
            item['title'] = title.xpath('strong/text()').extract()
            item['wage'] = title.xpath('span[@class="price"]/text()').extract()
            #item['description'] = title.xpath('div[@id="description"]/p/text()').extract()
            yield item
The for title in response.xpath('//body'): option does not work because the relative XPath expressions inside the loop then look for an h1 element that is a direct child of body, which on this page it is not.
Moreover, since there is only one desired entity to extract, you don't need a loop here at all:
def parse_item(self, response):
    item = BotItem()
    item["title"] = response.xpath('//h1/strong/text()').extract()
    item["wage"] = response.xpath('//h1/span[@class="price"]/text()').extract()
    item["description"] = response.xpath('//div[@id="description"]/p/text()').extract()
    return item
(this should also answer your second question about the description)
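A small follow-up, not part of the original answer: .extract() always returns a list, so each field above ends up as a one-element list. .extract_first() (or .get() in newer Scrapy versions) returns a plain string instead, or None when nothing matches:

def parse_item(self, response):
    item = BotItem()
    # extract_first() returns a single string rather than a list,
    # which usually simplifies downstream handling of the items.
    item["title"] = response.xpath('//h1/strong/text()').extract_first()
    item["wage"] = response.xpath('//h1/span[@class="price"]/text()').extract_first()
    item["description"] = response.xpath('//div[@id="description"]/p/text()').extract_first()
    return item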