So I'm trying to scrape the website given in the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string


class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(),
                               restrict_xpaths=('''//td[not(@*)]/div[not(@*)]/a[not(@class)]/@href''')),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason, when I tell the spider to crawl and export the text and links to a CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my XPaths above as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the XPath expressions for each deal have to be relative to the current deal. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
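To make the difference concrete, here is a tiny standalone sketch (the HTML is made up, purely for illustration): an expression starting with // always searches the whole document, while one starting with .// only searches inside the node you are currently iterating over.
from scrapy.selector import Selector

html = "<div id='a'><p>one</p></div><div id='b'><p>two</p></div>"
sel = Selector(text=html)
for div in sel.xpath("//div"):
    # absolute: ignores the current div and matches every <p> in the document
    print(div.xpath("//p/text()").extract())
    # relative: matches only the <p> inside this particular div
    print(div.xpath(".//p/text()").extract())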
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor


class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()


class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]

    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item
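For reference, if everything above is saved in a single file (desidime_spider.py is just an example name), it can be run without a full project and exported straight to CSV with:
scrapy runspider desidime_spider.py -o deals.csv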
Related
I'm new to Scrapy, and I'm currently making a spider that extracts only the event title and event description from a website. I am able to get the title and description; however, the spider also tries to extract data from a PDF link, which causes a "raise NotSupported("Response content isn't text")" error. How can I prevent the spider from doing this?
Here is my code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class EventsspiderSpider(CrawlSpider):
    name = 'eventsspider'
    allowed_domains = ['cs.acadiau.ca']
    start_urls = ['https://cs.acadiau.ca/news-events/event-reader/using-dna-to-reverse-engineer-your-family-tree.html']

    rules = (
        Rule(LinkExtractor(allow=('news-events/event-reader/using-dna-to-reverse-engineer-your-family-tree.html', )), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        i = {}
        title_list = response.xpath('//*[@id="event-items-15421"]/div[2]/div/h1/text()').extract()
        data_list = response.xpath('//*[@id="event-items-15421"]/div[2]/div/div[1]/p[7]/span/text()').extract()
        for x in range(0, len(title_list)):
            i['title'] = title_list[x]
            i['data'] = data_list[x]
            yield i
http://www.bbc.com/news/business-41097280 is the page I want the regular expression for.
So far, I am using the following:
'.+\/news\/business[-.]\d{8}$'
which is part of this code segment here, used with Scrapy:
from scrapy.item import Item, Field
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule


class TryItem(Item):
    url = Field()


class BbchrcrawlerSpider(CrawlSpider):
    name = "bbchrcrawler"
    allowed_domains = ["www.bbc.com"]
    start_urls = ['http://www.bbc.com/news/business-']
    rules = (Rule(LinkExtractor(allow=['.+\/news\/business+\-d{8}$']), callback='parse_item', follow=True),)

    def parse_item(self, response):
        item = TryItem()
        item['url'] = response.url
        yield item
What's the correct way to get the URL there for extracting multiple pages with the same format?
The result should collect URLs with the following format:
bbc.com/news/business-########
You can try this:
pattern = "bbc\.com/news/business-\d+"
rules = (Rule(LinkExtractor(allow=[pattern]), callback='parse_item', follow=True),)
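To sanity-check the pattern outside Scrapy, here is a small sketch using only Python's re module and the URL from the question:
import re

pattern = r"bbc\.com/news/business-\d+"

# The article URL from the question matches; an unrelated section page does not.
print(re.search(pattern, "http://www.bbc.com/news/business-41097280") is not None)    # True
print(re.search(pattern, "http://www.bbc.com/news/technology-41097280") is not None)  # False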
I want to create a Scrapy script to scrape all of the results for computer gigs in any craigslist subdomain:
for example here: http://losangeles.craigslist.org/search/cpg/
This query returns a list of many articles, and I've tried to scrape the title and href of each of these results (not only the ones on the first page) using CrawlSpider and LinkExtractor, to no avail; the script returns nothing.
I'll paste my script here, thanks:
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor


class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = (
        'http://losangeles.craigslist.org/search/cpg/',
    )

    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")

    for i in items:
        link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
        title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
        print link, title
According to the code you pasted, parse_page:
1. does not return/yield anything, and
2. only contains one line: "items = response.selector..."
The reason for #2 above is that the for loop is not properly indented.
Try to indent the for loop:
class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = ('http://losangeles.craigslist.org/search/cpg/',)

    rules = (Rule(
        LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")
        for i in items:
            link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
            title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
            print link, title
            yield dict(link=link, title=title)
I am having problems fully understanding how the SGML link extractor works. When making a crawler with Scrapy, I can successfully extract data from links using specific URLs. The problem is using Rules to follow a next-page link on a particular URL.
I think the problem lies in the allow() attribute. When the Rule is added to the code, the results do not display in the command line and the link to the next page is not followed.
Any help is greatly appreciated.
Here is the code...
import scrapy
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider
from scrapy.contrib.spiders import Rule
from tutorial.items import TutorialItem


class AllGigsSpider(CrawlSpider):
    name = "allGigs"
    allowed_domains = ["http://www.allgigs.co.uk/"]
    start_urls = [
        "http://www.allgigs.co.uk/whats_on/London/clubbing-1.html",
        "http://www.allgigs.co.uk/whats_on/London/festivals-1.html",
        "http://www.allgigs.co.uk/whats_on/London/comedy-1.html",
        "http://www.allgigs.co.uk/whats_on/London/theatre_and_opera-1.html",
        "http://www.allgigs.co.uk/whats_on/London/dance_and_ballet-1.html"
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('//div[@class="more"]',)), callback="parse_me", follow=True),
    )

    def parse_me(self, response):
        hxs = HtmlXPathSelector(response)
        infos = hxs.xpath('//div[@class="entry vevent"]')
        items = []
        for info in infos:
            item = TutorialItem()
            item['artist'] = hxs.xpath('//span[@class="summary"]//text()').extract()
            item['date'] = hxs.xpath('//abbr[@class="dtstart dtend"]//text()').extract()
            item['endDate'] = hxs.xpath('//abbr[@class="dtend"]//text()').extract()
            item['startDate'] = hxs.xpath('//abbr[@class="dtstart"]//text()').extract()
            items.append(item)
        return items
        print items
The problem is in the restrict_xpaths - it should point to a block where a link extractor should look for links. Don't specify allow at all:
rules = [
    Rule(SgmlLinkExtractor(restrict_xpaths='//div[@class="more"]'),
         callback="parse_me",
         follow=True),
]
And you need to fix your allowed_domains:
allowed_domains = ["www.allgigs.co.uk"]
Also note that the print items in the parse_me() callback is not reachable, since it lies after the return statement. And, in the loop, you should not apply the XPath expressions using hxs; they should be applied in the info context. You can also simplify parse_me():
def parse_me(self, response):
    for info in response.xpath('//div[@class="entry vevent"]'):
        item = TutorialItem()
        item['artist'] = info.xpath('.//span[@class="summary"]//text()').extract()
        item['date'] = info.xpath('.//abbr[@class="dtstart dtend"]//text()').extract()
        item['endDate'] = info.xpath('.//abbr[@class="dtend"]//text()').extract()
        item['startDate'] = info.xpath('.//abbr[@class="dtstart"]//text()').extract()
        yield item
Hey, I've just started using Scrapy and was trying it out on the website "diy.com", but I can't seem to get the CrawlSpider to follow links or scrape any data. I think it might be my regex, but I can't see anything wrong.
Any help will be appreciated.
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from items import PartItem


class DIY_spider(CrawlSpider):
    name = 'diy_cat'
    allowed_domains = ['diy.com']
    start_urls = [
        "http://www.diy.com/nav/decor/tiles/wall-tiles"
    ]

    rules = (
        Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ), deny=(r'(.*)/(jsp)/(.*)')), callback='parse_item', follow=True),

        def parse_items(self, response):
            sel = Selector(response)
            tests = []
            test = PartItem()
            if sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()'):
                price = sel.xpath('//*[@id="fullWidthContent"]/div[2]/dl/dd[1]/ul[1]/li[3]/text()')
            else:
                price = sel.xpath('//dd[@class="item_cta"]/ul[@class="fright item_price"]/li/text()').extract()
            if not price:
                return test
            return test
Besides @Talvalin's note, you are not getting actual prices.
Try this version of parse_item:
def parse_item(self, response):
    sel = Selector(response)
    price_list = sel.xpath('//span[@class="onlyPrice"]/text()').extract()
    for price in price_list:
        if price:
            item = PartItem()
            item['price'] = price
            yield item
Your rule states parse_item as the callback but the actual callback is named parse_items. Additionally, the indenting for the parse_items function is incorrect, but that could simply be a formatting issue when pasting the code in.
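A small sketch of that rename (reusing the rule from the question; nothing else changed), so the callback string and the method name agree:
rules = (
    Rule(SgmlLinkExtractor(allow=(r'/(nav)/(decor)/(\w*)/(.*)(\d*)$', ),
                           deny=(r'(.*)/(jsp)/(.*)', )),
         callback='parse_item',  # matches the method name below
         follow=True),
)

def parse_item(self, response):
    ...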