I want to create a Scrapy script to scrape all of the results for computer gigs in any craigslist subdomain:
for example here: http://losangeles.craigslist.org/search/cpg/
This query returns a list of many articles, and I've tried to scrape the title and href of each of these results (not only the ones on the first page) using CrawlSpider and LinkExtractor, but to no avail: the script returns nothing.
I'll paste my script here, thanks
import scrapy
from scrapy.spiders import Rule,CrawlSpider
from scrapy.linkextractors import LinkExtractor
# NOTE(review): question exhibit -- the paste lost its indentation. The answer
# that follows attributes the empty output to two things: the `for` loop is not
# indented into parse_page, and the callback never returns/yields anything.
class CraigspiderSpider(CrawlSpider):
name = "CraigSpider"
allowed_domains = ["http://losangeles.craigslist.org"]
start_urls = (
'http://losangeles.craigslist.org/search/cpg/',
)
# Single rule: links are extracted only from the region selected by
# restrict_xpaths, and each fetched page is handed to parse_page.
rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[#class="button next"]',)), callback="parse_page", follow= True),)
def parse_page(self, response):
items = response.selector.xpath("//p[#class='row']")
for i in items:
link = i.xpath("./span[#class='txt']/span[#class='pl']/a/#href").extract()
title = i.xpath("./span[#class='txt']/span[#class='pl']/a/span[#id='titletextonly']/text()").extract()
# Python 2 print statement; nothing is returned or yielded afterwards.
print link,title
According to the code you pasted, parse_page:
1. does not return/yield anything, and
2. only contains one line: "items = response.selector..."
The reason for #2 above is that the for loop is not properly indented.
Try to indent the for loop:
class CraigspiderSpider(CrawlSpider):
    """Scrape the link and title of every computer-gig post on LA Craigslist.

    The single rule follows the "next" pagination button and hands every
    page reached that way to ``parse_page``.
    """

    name = "CraigSpider"
    # Bare host names only: an entry carrying a scheme ("http://...") is not a
    # domain name and does not work with the offsite filter.
    allowed_domains = ["losangeles.craigslist.org"]
    start_urls = ('http://losangeles.craigslist.org/search/cpg/',)
    # The trailing comma matters: without it ``rules`` is a lone Rule object
    # instead of the iterable of rules that CrawlSpider compiles.
    rules = (
        Rule(
            LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
            callback="parse_page",
            follow=True,
        ),
    )

    def parse_start_url(self, response, **kwargs):
        """Run the first listing page through ``parse_page`` as well.

        Rule callbacks only receive pages reached through extracted links, so
        the results on the start URL itself would otherwise be skipped.
        """
        return self.parse_page(response)

    def parse_page(self, response):
        """Yield one ``{"link": [...], "title": [...]}`` dict per result row."""
        for row in response.xpath("//p[@class='row']"):
            link = row.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
            title = row.xpath(
                "./span[@class='txt']/span[@class='pl']/a"
                "/span[@id='titletextonly']/text()"
            ).extract()
            # Logged instead of printed so the line is valid on Python 2 and 3.
            self.logger.debug("%s %s", link, title)
            yield dict(link=link, title=title)
Related
I tried to parse product names and prices from a site using Scrapy. However, when I run my Scrapy code, it neither shows any error nor fetches any data. I cannot work out what I'm doing wrong. I hope someone can take a look at it.
"items.py" includes:
import scrapy
# Item container for one scraped product; declares the Name and Price fields.
# NOTE(review): question exhibit -- indentation was lost in this paste.
class SephoraItem(scrapy.Item):
Name = scrapy.Field()
Price = scrapy.Field()
spider file named "sephorasp.py" contains:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# NOTE(review): question exhibit -- indentation was lost in this paste. The
# answer further down flags both restrict_xpaths expressions: they compare the
# entire class attribute string, which can change or be ordered differently.
class SephoraspSpider(CrawlSpider):
name = "sephorasp"
allowed_domains = ['sephora.ae']
start_urls = ["https://www.sephora.ae/en/stores/"]
# Two rules: the first has no callback, the second hands pages to parse_item.
rules = [
Rule(LinkExtractor(restrict_xpaths='//li[#class="level0 nav-1 active first touch-dd parent"]')),
Rule(LinkExtractor(restrict_xpaths='//li[#class="level2 nav-1-1-1 active first"]'),
callback="parse_item")
]
# Yields one {'Name': [...], 'Price': [...]} dict per "product-info" div.
def parse_item(self, response):
page = response.xpath('//div[#class="product-info"]')
for titles in page:
Product = titles.xpath('.//a[#title]/text()').extract()
Rate = titles.xpath('.//span[#class="price"]/text()').extract()
yield {'Name':Product,'Price':Rate}
Here is the Link to the Log:
"https://www.dropbox.com/s/8xktgh7lvj4uhbh/output.log?dl=0"
It works when I play around with BaseSpider:
from scrapy.spider import BaseSpider
from scrapy.http.request import Request
# NOTE(review): question exhibit -- indentation was lost in this paste.
# Nothing visible in this snippet references `pro` or `end`; presumably only
# `parse` runs, as the callback for the hard-coded start_urls -- confirm.
class SephoraspSpider(BaseSpider):
name = "sephorasp"
allowed_domains = ['sephora.ae']
start_urls = [
"https://www.sephora.ae/en/travel-size/make-up",
"https://www.sephora.ae/en/perfume/women-perfume",
"https://www.sephora.ae/en/makeup/eye/eyeshadow",
"https://www.sephora.ae/en/skincare/moisturizers",
"https://www.sephora.ae/en/gifts/palettes"
]
# Requests every href of anchors whose class contains "level0"; callback is end.
def pro(self, response):
item_links = response.xpath('//a[contains(#class,"level0")]/#href').extract()
for a in item_links:
yield Request(a, callback = self.end)
# Requests every href of anchors whose class is exactly "level2"; callback is parse.
def end(self, response):
item_link = response.xpath('//a[#class="level2"]/#href').extract()
for b in item_link:
yield Request(b, callback = self.parse)
# Yields one {'Name': [...], 'Price': [...]} dict per "product-info" div.
def parse(self, response):
page = response.xpath('//div[#class="product-info"]')
for titles in page:
Product= titles.xpath('.//a[#title]/text()').extract()
Rate= titles.xpath('.//span[#class="price"]/text()').extract()
yield {'Name':Product,'Price':Rate}
Your xpaths are heavily flawed.
Rule(LinkExtractor(restrict_xpaths='//li[@class="level0 nav-1 active first touch-dd parent"]')),
Rule(LinkExtractor(restrict_xpaths='//li[@class="level2 nav-1-1-1 active first"]'),
You are matching the whole class attribute value, which can change at any point, and the order of the classes might be different in the HTML that Scrapy receives. Just pick one class; it is most likely unique enough:
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class,"level0")]')),
Rule(LinkExtractor(restrict_xpaths='//li[contains(@class,"level2")]')),
I am new to Scrapy and Python. I was trying to retrieve data from https://in.bookmyshow.com/movies, since I need the information for all the movies. But there is something wrong with my code, and I would like to know where I have gone wrong.
rules = ( Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow= True),)
# Callback: yields one Ex1Item per div whose class contains "movie-card".
# NOTE(review): question exhibit -- indentation was lost in this paste.
def parse_items(self, response):
for sel in response.xpath('//div[contains(#class, "movie-card")]'):
item = Ex1Item()
item['Moviename'] = sel.xpath('.//a[#class="__movie-name"]/text()').extract()
# Path starts at /html, i.e. it is absolute rather than relative to `sel`.
item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
item['Info'] = sel.xpath('.//div[#class="__rounded-box __genre"]/text()').extract()
# Path starts at /html, i.e. it is absolute rather than relative to `sel`.
item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
item['Release'] = sel.xpath('.//span[#class="__release-date"]/text()').extract()
yield item
Your code seems to be fine. Perhaps the problem is outside of the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
class BookmyshowSpider(CrawlSpider):
    """Follow every /movies/ link on BookMyShow and scrape its movie cards."""

    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    rules = (
        Rule(
            # Raw string so the regex escapes reach the regex engine untouched.
            SgmlLinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)),
            callback="parse_items",
            follow=True,
        ),
    )

    def parse_items(self, response):
        """Yield one ``Ex1Item`` per div whose class contains "movie-card"."""
        for card in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = card.xpath('.//a[@class="__movie-name"]/text()').extract()
            # Absolute path: resolved from the document root, not from `card`.
            item['Language'] = card.xpath(
                '/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]'
                '/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()'
            ).extract()
            item['Info'] = card.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            # Absolute path: resolved from the document root, not from `card`.
            item['Synopsis'] = card.xpath(
                '/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]'
                '/blockquote/text()'
            ).extract()
            item['Release'] = card.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy
class BookmyshowSpider(scrapy.Spider):
    """Plain-Spider variant: collect movie links by hand, then scrape each page."""

    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        """Request every distinct movie link found on the start page."""
        # Raw string keeps the regex backslashes intact for the regex engine.
        links = response.xpath('//a/@href').re(r'movies/[^\/]+\/.*$')
        # set() drops duplicate hrefs before any request is created.
        for url in set(links):
            yield scrapy.Request(response.urljoin(url), callback=self.parse_movie)

    def parse_movie(self, response):
        """Yield one plain dict per div whose class contains "movie-card"."""
        for card in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = card.xpath('.//a[@class="__movie-name"]/text()').extract()
            # Absolute path: resolved from the document root, not from `card`.
            item['Language'] = card.xpath(
                '/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]'
                '/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()'
            ).extract()
            item['Info'] = card.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            # Absolute path: resolved from the document root, not from `card`.
            item['Synopsis'] = card.xpath(
                '/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]'
                '/blockquote/text()'
            ).extract()
            item['Release'] = card.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() parses all links to movie pages from the start page. parse_movie() is used as a callback for all Requests to the specific movie pages. With this version you certainly have more control over the spider behavior.
So I'm trying to scrape the website targeted by the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string
# NOTE(review): question exhibit -- indentation was lost in this paste. The
# answer that follows locates the problem in parse_items: the XPaths used
# inside the loop start with "//" instead of being relative to the current deal.
class DesidimeSpider(CrawlSpider):
name = "desidime"
allowed_domains = ["desidime.com"]
start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
# The restrict_xpaths string is triple-quoted and contains a line break.
rules = (
Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(#*)]/div
[not(#*)]/a[not(#class)]/#href''')), callback="parse_items", follow=True),
)
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
deals = hxs.select('''//div[#class='user-comment-text'][1]''')
items = []
# The loop variable reuses the name of the sequence being iterated.
for deals in deals:
item = DesidimeItem()
item["deal"] = deals.select("//div[#class='user-comment-text'][1]/p/text()").extract()
item["link"] = deals.select("//div[#class='user-comment-text'][1]/p[1]/a[1]/#href").extract()
items.append(item)
return items
It should be quite obvious what I'm trying to do, but for some reason, when I tell the spider to crawl and export the text and links to the CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my above XPaths as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    """Yield one DesidimeItem per matched "user-comment-text" div.

    The inner expressions start with a dot so they are evaluated relative to
    the current deal rather than against the whole document.
    """
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider, I'm executing (it does scrape the text and links, though I don't know what is your desired output):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class DesidimeItem(scrapy.Item):
    """Container for one scraped deal: its text and the links it contains."""

    deal = scrapy.Field()
    link = scrapy.Field()
class DesidimeSpider(CrawlSpider):
    """Crawl the desidime.com hot-deals forum and scrape deal text and links."""

    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = [
        Rule(
            # restrict_xpaths selects the <a> elements themselves, not @href.
            LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
            callback="parse_items",
            follow=True,
        ),
    ]

    def parse_items(self, response):
        """Yield one DesidimeItem per matched "user-comment-text" div."""
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            # Leading dot keeps each query relative to the current deal node.
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item
I am having problems fully understanding how SGML Link Extractor works. When making a crawler with Scrapy, I can successfully extract data from links using specific URLS. The problem is using Rules to follow a next page link in a particular URL.
I think the problem lies in the allow() attribute. When the Rule is added to the code, the results do not display in the command line and the link to the next page is not followed.
Any help is greatly appreciated.
Here is the code...
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from tutorial.items import TutorialItem
# NOTE(review): question exhibit -- indentation was lost in this paste. The
# answer that follows points out that allowed_domains holds a URL rather than
# a domain, that the XPaths inside the loop are applied to `hxs` instead of
# `info`, and that the final print can never be reached.
class AllGigsSpider(CrawlSpider):
name = "allGigs"
allowed_domains = ["http://www.allgigs.co.uk/"]
start_urls = [
"http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
"http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
"http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
"http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
"http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html"
]
rules = (Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[#class="more"]',)), callback="parse_me", follow= True),
)
def parse_me(self, response):
hxs = HtmlXPathSelector(response)
infos = hxs.xpath('//div[#class="entry vevent"]')
items = []
for info in infos:
item = TutorialItem()
# Each query below runs against the whole page (hxs), not against `info`.
item ['artist'] = hxs.xpath('//span[#class="summary"]//text()').extract()
item ['date'] = hxs.xpath('//abbr[#class="dtstart dtend"]//text()').extract()
item ['endDate'] = hxs.xpath('//abbr[#class="dtend"]//text()').extract()
item ['startDate'] = hxs.xpath('//abbr[#class="dtstart"]//text()').extract()
items.append(item)
return items
# Placed after the return statement, so it never executes.
print items
The problem is in the restrict_xpaths - it should point to a block where a link extractor should look for links. Don't specify allow at all:
# restrict_xpaths names the block in which the link extractor looks for links;
# `allow` is deliberately left unspecified.
rules = [
    Rule(
        SgmlLinkExtractor(restrict_xpaths='//div[@class="more"]'),
        callback="parse_me",
        follow=True,
    ),
]
And you need to fix your allowed_domains:
allowed_domains = ["www.allgigs.co.uk"]
Also note that the print items in the parse_me() callback is not reachable since it lies after the return statement. And, in the loop, you should not apply XPath expression using hxs, the expressions should be used in the info context. And you can simplify the parse_me():
def parse_me(self, response):
    """Yield one TutorialItem per "entry vevent" div on the page.

    Every field is queried relative to the current entry (leading dot), so
    each item only carries the values of its own event.
    """
    for info in response.xpath('//div[@class="entry vevent"]'):
        item = TutorialItem()
        item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
        item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
        item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract()
        item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract()
        yield item
I am attempting to scrape the Library of Congress/Thomas website. This Python script is intended to access a sample of 40 bills from their site (# 1-40 identifiers in the URLs). I want to parse the body of each piece of legislation, search in the body/content, extract links to potential multiple versions & follow.
Once on the version page(s) I want to parse the body of each piece of legislation, search the body/content & extract links to potential sections & follow.
Once on the section page(s) I want to parse the body of each section of a bill.
I believe there is some issue with the Rules/LinkExtractor segment of my code. The python code is executing, crawling the start urls, but not parsing or any of the subsequent tasks.
Three issues:
Some bills do not have multiple versions (and ergo no links in the body portion of the URL).
Some bills do not have linked sections because they are so short, while some are nothing but links to sections.
Some section links do not contain just section-specific content, and most of the content is just redundant inclusion of prior or subsequent section content.
My question is again, why is Scrapy not crawling or parsing?
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
# Item containers used by the spider's callbacks (one class per page type).
# NOTE(review): question exhibit -- indentation was lost in this paste.
# Bill page: title and body fields.
class BillItem(Item):
title = Field()
body = Field()
# Bill-version page: title and body fields.
class VersionItem(Item):
title = Field()
body = Field()
# Bill-section page: body field only.
class SectionItem(Item):
body = Field()
# NOTE(review): question exhibit -- indentation was lost in this paste. Per the
# follow-up answers, `rules` was not indented into the class body in the
# original script, so it never became a class attribute.
class Lrn2CrawlSpider(CrawlSpider):
name = "lrn2crawl"
allowed_domains = ["thomas.loc.gov"]
# NOTE(review): zero-padded integer literals are octal in Python 2, so 00040
# is 32 and this range stops at bill 31 rather than 40.
start_urls = ["http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill for bill in xrange(000001,00040,00001) ### Sample of 40 bills; Total range of bills is 1-5767
]
rules = (
# Extract links matching the /query/ fragment (restricted to those inside the content body of the page)
# and follow links from them (follow=True is passed explicitly alongside the callback).
# Desired result: scrape all bill text & in the event that there are multiple versions, follow them & parse.
Rule(SgmlLinkExtractor(allow=(r'/query/'), restrict_xpaths=('//div[#id="content"]')), callback='parse_bills', follow=True),
# Extract links in the body of a bill-version & follow them.
# Desired result: scrape all version text & in the event that there are multiple sections, follow them & parse.
Rule(SgmlLinkExtractor(restrict_xpaths=('//div/a[2]')), callback='parse_versions', follow=True)
)
# Builds one BillItem per content div; `body` is the full response body.
def parse_bills(self, response):
hxs = HtmlXPathSelector(response)
bills = hxs.select('//div[#id="content"]')
scraped_bills = []
for bill in bills:
scraped_bill = BillItem() ### Bill object defined previously
scraped_bill['title'] = bill.select('p/text()').extract()
scraped_bill['body'] = response.body
scraped_bills.append(scraped_bill)
return scraped_bills
# Builds one VersionItem per content div; `body` is the full response body.
def parse_versions(self, response):
hxs = HtmlXPathSelector(response)
versions = hxs.select('//div[#id="content"]')
scraped_versions = []
for version in versions:
scraped_version = VersionItem() ### Version object defined previously
scraped_version['title'] = version.select('center/b/text()').extract()
scraped_version['body'] = response.body
scraped_versions.append(scraped_version)
return scraped_versions
# Builds one SectionItem per content div. Neither rule above names this
# method as its callback.
def parse_sections(self, response):
hxs = HtmlXPathSelector(response)
sections = hxs.select('//div[#id="content"]')
scraped_sections = []
for section in sections:
scraped_section = SectionItem() ## Segment object defined previously
scraped_section['body'] = response.body
scraped_sections.append(scraped_section)
return scraped_sections
spider = Lrn2CrawlSpider()
Just for the record, the problem with your script is that the variable rules is not inside the scope of Lrn2CrawlSpider because it doesn't share the same indentation, so when alecxe fixed the indentation, rules became an attribute of the class. Later, the inherited method __init__() reads the attribute, compiles the rules and enforces them.
def __init__(self, *a, **kw):
    """Initialise the spider, then compile the rules held on ``self``."""
    super(CrawlSpider, self).__init__(*a, **kw)
    # Runs at construction time, which is why ``rules`` has to be reachable
    # as an attribute of the spider for the rules to take effect.
    self._compile_rules()
Erasing the last line had nothing to do with that.
I've just fixed the indentation, removed spider = Lrn2CrawlSpider() line at the end of the script, ran the spider via scrapy runspider lrn2crawl.py and it scrapes, follows links, returns items - your rules work.
Here's what I'm running:
from scrapy.item import Item, Field
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class BillItem(Item):
    """Scraped bill page: its title and body."""

    title = Field()
    body = Field()


class VersionItem(Item):
    """Scraped bill-version page: its title and body."""

    title = Field()
    body = Field()


class SectionItem(Item):
    """Scraped bill-section page: body only."""

    body = Field()
class Lrn2CrawlSpider(CrawlSpider):
    """Crawl a sample of 107th-Congress House bills on thomas.loc.gov.

    ``rules`` must stay inside the class body: CrawlSpider compiles the rules
    it finds as an attribute when the spider is constructed.
    """

    name = "lrn2crawl"
    allowed_domains = ["thomas.loc.gov"]
    # Sample of 40 bills (H.R. 1-40); total range of bills is 1-5767.
    # Plain decimal bounds: zero-padded literals such as 00040 are octal in
    # Python 2 (00040 == 32, so only bills 1-31 were requested) and a syntax
    # error in Python 3.
    start_urls = [
        "http://thomas.loc.gov/cgi-bin/query/z?c107:H.R.%s:" % bill
        for bill in range(1, 41)
    ]

    rules = (
        # Extract links matching the /query/ fragment, restricted to those
        # inside the content div. follow=True is passed explicitly because a
        # rule that has a callback does not follow links by default.
        # Desired result: scrape all bill text and, where there are multiple
        # versions, follow and parse them.
        Rule(
            SgmlLinkExtractor(
                allow=(r'/query/',),
                restrict_xpaths=('//div[@id="content"]',),
            ),
            callback='parse_bills',
            follow=True,
        ),
        # Extract links in the body of a bill version and follow them.
        # Desired result: scrape all version text and, where there are
        # multiple sections, follow and parse them.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('//div/a[2]',)),
            callback='parse_versions',
            follow=True,
        ),
    )

    def parse_bills(self, response):
        """Return one BillItem per content div of a bill page."""
        hxs = HtmlXPathSelector(response)
        scraped_bills = []
        for bill in hxs.select('//div[@id="content"]'):
            scraped_bill = BillItem()
            scraped_bill['title'] = bill.select('p/text()').extract()
            # The body is the whole raw response, not just the matched div.
            scraped_bill['body'] = response.body
            scraped_bills.append(scraped_bill)
        return scraped_bills

    def parse_versions(self, response):
        """Return one VersionItem per content div of a bill-version page."""
        hxs = HtmlXPathSelector(response)
        scraped_versions = []
        for version in hxs.select('//div[@id="content"]'):
            scraped_version = VersionItem()
            scraped_version['title'] = version.select('center/b/text()').extract()
            # The body is the whole raw response, not just the matched div.
            scraped_version['body'] = response.body
            scraped_versions.append(scraped_version)
        return scraped_versions

    def parse_sections(self, response):
        """Return one SectionItem per content div of a section page.

        NOTE(review): neither rule above names this method as its callback.
        """
        hxs = HtmlXPathSelector(response)
        scraped_sections = []
        for section in hxs.select('//div[@id="content"]'):
            scraped_section = SectionItem()
            scraped_section['body'] = response.body
            scraped_sections.append(scraped_section)
        return scraped_sections
Hope that helps.