My site contains 3 levels:
Country
City
Street
I want to scrape the data from all the street pages. For this I have built a spider.
Now, how do I get from the country level down to the streets without adding a million URLs to the start_urls field?
Do I build one spider for the country level, one for cities, and one for streets?
Isn't the whole idea of crawling that the crawler follows all links down to a certain depth?
Adding DEPTH_LIMIT = 3 to the settings.py file did not change anything.
I start the crawl with: scrapy crawl spidername
EDIT
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector
from winkel.items import WinkelItem

class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["mydomain.nl"]
    start_urls = [
        "http://www.mydomain.nl/Zuid-Holland"
    ]

    # allow takes regular expressions, so a plain substring works here;
    # '*Zuid-Holland*' is an invalid regex (a leading '*' has nothing to repeat)
    rules = (Rule(SgmlLinkExtractor(allow=('Zuid-Holland', )), callback='parse_winkel', follow=True),)

    def parse_winkel(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul[@id="itemsList"]/li')
        items = []

        for site in sites:
            item = WinkelItem()
            item['adres'] = (site.xpath('.//a/text()').extract(),
                             site.xpath('text()').extract(),
                             sel.xpath('//h1/text()').re(r'winkel\s*(.*)'))
            items.append(item)
        return items
You need to make use of CrawlSpider and define Rules with link extractors for the countries, cities and streets. DEPTH_LIMIT, by the way, only caps how deep a crawl may go; it does not make a spider follow links by itself. That is what the Rules are for.
For example:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class MySpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # follow=True is needed on the country and city rules; follow
        # defaults to False when a callback is given, and without it the
        # crawl would stop at the country pages
        Rule(SgmlLinkExtractor(allow=('country', )), callback='parse_country', follow=True),
        Rule(SgmlLinkExtractor(allow=('city', )), callback='parse_city', follow=True),
        Rule(SgmlLinkExtractor(allow=('street', )), callback='parse_street'),
    )

    def parse_country(self, response):
        self.log('Hi, this is a country page! %s' % response.url)

    def parse_city(self, response):
        self.log('Hi, this is a city page! %s' % response.url)

    def parse_street(self, response):
        self.log('Hi, this is a street page! %s' % response.url)
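If only the street pages actually carry data, a leaner variant is to let the first two rules follow links without a callback (follow defaults to True when no callback is given) and only parse the streets. A minimal sketch, assuming the same 'country'/'city'/'street' URL patterns:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class StreetSpider(CrawlSpider):
    name = 'streets'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # no callback: these rules only follow links down the hierarchy
        Rule(SgmlLinkExtractor(allow=('country', ))),
        Rule(SgmlLinkExtractor(allow=('city', ))),
        # street pages are the only ones handed to a callback
        Rule(SgmlLinkExtractor(allow=('street', )), callback='parse_street'),
    )

    def parse_street(self, response):
        self.log('Street page: %s' % response.url)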
So I'm trying to scrape the website matched by the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(),
                               restrict_xpaths=('//td[not(@*)]/div[not(@*)]/a[not(@class)]/@href')),
             callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason when I tell the spider to crawl and export the text and links to a CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my XPaths above as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal context-specific locators have to be relative. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item
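To get the CSV output the question asks about, the spider can be run through the feed exporter the same way as any other; deals.csv here is just an example filename:

scrapy crawl desidime -o deals.csv -t csv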
I want to crawl a complete website using Scrapy, but right now it's only crawling a single page:
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter

class IzodspiderSpider(scrapy.Spider):
    name = 'izodspider'
    allowed_domains = ['izod.com']
    start_urls = ['http://izod.com/']
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
Is there any way to extract meta tags using Portia?
There is an error in the rule definition and in the callback.
Since the function you define is parse_item, the Rule has to reference it as the callback instead of parse. The spider also needs to subclass CrawlSpider rather than scrapy.Spider, otherwise the rules are never applied.
You can find more information about the callback argument in the documentation: http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class IzodspiderSpider(CrawlSpider):
    name = "izod"
    depth_limit = 0
    bot_name = 'izod'
    allowed_domains = ['izod.com']
    start_urls = ['http://www.izod.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=()), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
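As written, parse_items only fills local variables, so nothing is exported. To actually output the data you would define an item and yield it. A sketch, assuming a hypothetical IzodItem (not part of the original code) with one field per XPath:

from scrapy.item import Item, Field

class IzodItem(Item):
    # hypothetical item class holding the three extracted fields
    meta = Field()
    name = Field()
    desc = Field()

and the callback would become:

    def parse_items(self, response):
        item = IzodItem()
        item['meta'] = response.xpath('//meta[@name=\'description\']/@content').extract()
        item['name'] = response.xpath('//div[@id=\'product-details\']/h5').extract()
        item['desc'] = response.xpath('//div[@id=\'product-details\']/p').extract()
        yield item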
I am making a crawler to crawl the website recursively, but the problem is that the spider does not enter the parse_item method. The name of my spider file is example.py. The code is given below:
from scrapy.spider import Spider
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from scrapy.http.request import Request
from scrapy.utils.response import get_base_url

class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = [
        "http://www.dmoz.org/Arts/"
    ]
    print start_urls

    rules = (
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

    # The spider is not entering this parse_item callback
    def parse_item(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
Why are you trying to define and call a function explicitly?
Try this:
class CrawlSpider(CrawlSpider):
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    print allowed_domains
    start_urls = ["http://www.dmoz.org/Arts/"]

    def parse(self, response):
        print "hello parse"
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        print title
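One caution here: the Scrapy documentation warns against using parse as a callback on a CrawlSpider, because CrawlSpider uses the parse method itself to implement its rule logic. Overriding it means only the start_urls get processed. If a recursive crawl is the goal, it is safer to keep the rules with a differently named callback, and to rename the class so it does not shadow the CrawlSpider base class. A minimal sketch:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

class ArtsSpider(CrawlSpider):
    # renamed so the class no longer shadows the CrawlSpider base class
    name = "example"
    download_delay = 2
    allowed_domains = ["dmoz.org"]
    start_urls = ["http://www.dmoz.org/Arts/"]

    rules = (
        # the callback must not be named 'parse' on a CrawlSpider
        Rule(SgmlLinkExtractor(allow=('/Arts', )), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        sel = Selector(response)
        title = sel.xpath('//title/text()').extract()
        self.log('title: %s' % title)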
I tried to do it with a CrawlSpider, and this is the code, but the spider didn't return any results (it opened and closed right after):
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from torent.items import TorentItem

class MultiPagesSpider(CrawlSpider):
    name = 'job'
    allowed_domains = ['tanitjobs.com/']
    start_urls = ['http://tanitjobs.com/browse-by-category/Nurse/?searchId=1393459812.065&action=search&page=1&view=list',]

    rules = (
        Rule(SgmlLinkExtractor(allow=('page=*',), restrict_xpaths=('//div[@class="pageNavigation"]',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = hxs.select('//div[@class="offre"]/div[@class="detail"]')
        scraped_items = []
        for item in items:
            scraped_item = TorentItem()
            scraped_item["title"] = item.select('a/strong/text()').extract()
            scraped_items.append(scraped_item)
        return items
What @paul t. said in the comment above, but additionally you need to return scraped_items rather than items, otherwise you'll get a large number of errors that look like this:
2014-02-26 23:40:59+0000 [job] ERROR: Spider must return Request, BaseItem or None, got 'HtmlXPathSelector' in
<GET http://tanitjobs.com/browse-by-category/Nurse/?action=search&page=3&searchId=1393459812.065&view=list>
I have a file which has a list of domains. I need to crawl each domain (i.e. the whole website) to get RSS links: recursively crawl each page of the website, collect the RSS links from each page, and write them to a JSON file corresponding to the domain. This is my code for just one website:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    #rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        sites = sel.select('/html/head/link[@type=application/rss+xml]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = sites
        #items.append(item)
        return item
I tried running:
scrapy crawl apple -o items.json -t json
but items.json only contains an opening bracket [.
This is my items.py file:
from scrapy.item import Item, Field

class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()
Your XPath expression needs to have quotes around the "application/rss+xml" test value. Note also that your rules line is commented out, so parse_item is never called and no items are ever exported.
Try something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field

class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()

class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        rsslinks = sel.select('/html/head/link[@type="application/rss+xml"]/@href').extract()
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = rsslinks
        return item
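With the quotes added and the rule re-enabled, running the same command as before should produce populated output, one item per crawled page with its URL and any RSS links found in the page head:

scrapy crawl apple -o items.json -t json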