Get RSS links given a domain - python

I have a file which has a list of domains. I need to crawl each domain (i.e. the whole website) to get RSS links: recursively crawl each page of the website, collect the RSS links from each page, and write them to a JSON file corresponding to that domain. This is my code for just one website:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector

class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    #rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        sites = sel.select('/html/head/link[@type=application/rss+xml]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = sites
        #items.append(item)
        return item
I tried running:
scrapy crawl apple -o items.json -t json
but items.json only contains an opening bracket [.
This is my items.py file:
from scrapy.item import Item, Field

class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()

Your XPath expression needs to have quotes around the "application/rss+xml" test value.
Try something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field

class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()

class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']
    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        rsslinks = sel.select('/html/head/link[@type="application/rss+xml"]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = rsslinks
        #items.append(item)
        return item
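Since the question mentions a whole file of domains with one JSON output per domain, here is a rough sketch of one possible driver script. It assumes a domains.txt file with one domain per line and a spider adapted to accept a domain argument (passed via -a) and build its allowed_domains / start_urls from it; neither of those is in the original code.

import subprocess

with open('domains.txt') as f:
    for line in f:
        domain = line.strip()
        if not domain:
            continue
        # -a passes a spider argument (the spider must read it in __init__);
        # -o writes one feed file per domain.
        subprocess.call([
            'scrapy', 'crawl', 'apple',
            '-a', 'domain=%s' % domain,
            '-o', '%s.json' % domain,
        ])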

Related

How to use Scrapy for URL crawling

I want to crawl the link https://www.aparat.com/.
I crawl it correctly and get all the video links with the header tag, like this:
import scrapy

class BlogSpider(scrapy.Spider):
    name = 'aparatspider'
    start_urls = ['https://www.aparat.com/']

    def parse(self, response):
        print '=' * 80, 'latest-trend :'
        ul5 = response.css('.block-grid.xsmall-block-grid-2.small-block-grid-3.medium-block-grid-4.large-block-grid-5.is-not-center')
        ul5 = ul5.css('ul').css('li')
        latesttrend = []
        for li5 in ul5:
            latesttrend.append(li5.xpath('div/div[1]/a').xpath('@onmousedown').extract_first().encode('utf8'))
        print(latesttrend)
Now my question is this:
How can I get all the links from the داغ ترین ها ("hottest") tag, more than 1000? Currently I get only about 60.
I did this with the following code:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.http import Request

class aparat_hotnewsItem(scrapy.Item):
    videourl = scrapy.Field()

class aparat_hotnewsSpider(CrawlSpider):
    name = 'aparat_hotnews'
    allowed_domains = ['www.aparat.com']
    start_urls = ['http://www.aparat.com/']

    # Xpath for selecting links to follow
    xp = 'your xpath'

    rules = (
        Rule(LinkExtractor(restrict_xpaths=xp), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = aparat_hotnewsItem()
        item['videourl'] = response.xpath('your xpath').extract()
        yield item
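As a purely illustrative sketch of how those placeholders might be filled in (the XPath expressions below are hypothetical, not taken from aparat.com's real markup): point restrict_xpaths at the pagination or "more" links so the rule keeps following result pages, and extract the video anchors in the callback.

    # Hypothetical selectors, for illustration only; they need to be replaced
    # with XPath matching aparat.com's actual pagination links and video anchors.
    rules = (
        Rule(LinkExtractor(restrict_xpaths='//div[@class="pagination"]//a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = aparat_hotnewsItem()
        item['videourl'] = response.xpath('//a[contains(@href, "/v/")]/@href').extract()
        yield item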

Scrapy Spider cannot Extract contents of web page using xpath

I have a Scrapy spider and I am using XPath selectors to extract the contents of the page. Kindly check where I am going wrong:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request

class MySpider(CrawlSpider):
    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']), callback='parse_page', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return(items)
There are a lot of issues with your code, so here is a different approach.
I opted against a CrawlSpider to have more control over the scraping process, especially for grabbing the name from the query page and the story from a detail page.
I tried to simplify the XPath statements by not diving into the (nested) table structures but looking for patterns of content: if you want to extract a story, there must be a link to a story.
Here is the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy

class MyItem(scrapy.Item):
    name = scrapy.Field()
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        rows = response.xpath('//a[contains(@href, "display_story")]')

        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # Create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            request = scrapy.Request(url=story_url, callback=self.parse_detail)  # create request for detail page with story
            request.meta['myItem'] = myItem  # pass the item with the request
            yield request

    def parse_detail(self, response):
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        text_raw = response.xpath('//font[@size=3]//text()').extract()  # extract the story (text)
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))  # clean up the text and assign to item
        yield myItem  # return the item
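Two small notes: the unicode.strip call ties this to Python 2; under Python 3 the equivalent would be ' '.join(map(str.strip, text_raw)). And to run the spider and dump the stories, something like scrapy crawl medical -o stories.json should work, assuming the spider lives inside a Scrapy project.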

How to recursively crawl whole website using scrapy

I want to crawl a complete website using Scrapy, but right now it only crawls a single page.
import scrapy
from scrapy.http import HtmlResponse
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import JsonItemExporter

class IzodspiderSpider(scrapy.Spider):
    name = 'izodspider'
    allowed_domains = ['izod.com']
    start_urls = ['http://izod.com/']
    rules = [Rule(SgmlLinkExtractor(), callback='parse_item', follow=True)]

    def parse(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
Is there any way to extract meta tags using Portia?
There is an error in the rule definition and inside the callback.
Since the parse function you use is parse_items, you have to reference it in the callback instead of parse.
You can find more information about the callback function in the documentation here: http://doc.scrapy.org/en/latest/topics/request-response.html?highlight=callback#topics-request-response-ref-request-callback-arguments
class IzodspiderSpider(CrawlSpider):
    name = "izod"
    depth_limit = 0
    bot_name = 'izod'
    allowed_domains = ['izod.com']
    start_urls = ['http://www.izod.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('')), callback='parse_items', follow=True),
    )

    def parse_items(self, response):
        hxs = scrapy.Selector(response)
        meta = hxs.xpath('//meta[@name=\'description\']/@content').extract()
        name = hxs.xpath('//div[@id=\'product-details\']/h5').extract()
        desc = hxs.xpath('//div[@id=\'product-details\']/p').extract()
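Side note, as a hedged aside: the scrapy.contrib modules and SgmlLinkExtractor used in these snippets come from older Scrapy releases; in current versions the rough equivalents are:

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule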

Crawling depth automation

My site contains 3 levels:
Country
City
Street
I want to scrape the data from all the street pages. For this I have built a spider.
Now how do I get from Country to Street without adding a million URLs to the start_urls field?
Do I build a spider for the country, one for the city and one for the street?
Isn't the whole idea of crawling that the crawler follows all links down to a certain depth?
Adding DEPTH_LIMIT = 3 to the settings.py file did not change anything.
I start the crawl with: scrapy crawl spidername
EDIT
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import Spider
from scrapy.selector import Selector
from winkel.items import WinkelItem

class DmozSpider(CrawlSpider):
    name = "dmoz"
    allowed_domains = ["mydomain.nl"]
    start_urls = [
        "http://www.mydomain.nl/Zuid-Holland"
    ]
    rules = (Rule(SgmlLinkExtractor(allow=('*Zuid-Holland*', )), callback='parse_winkel', follow=True),)

    def parse_winkel(self, response):
        sel = Selector(response)
        sites = sel.xpath('//ul[@id="itemsList"]/li')
        items = []
        for site in sites:
            item = WinkelItem()
            item['adres'] = site.xpath('.//a/text()').extract(), site.xpath('text()').extract(), sel.xpath('//h1/text()').re(r'winkel\s*(.*)')
            items.append(item)
        return items
You need to make use of CrawlSpider and define Rules with link extractors for countries, cities and streets.
For example:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import Selector

class MySpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        Rule(SgmlLinkExtractor(allow=('country', )), callback='parse_country'),
        Rule(SgmlLinkExtractor(allow=('city', )), callback='parse_city'),
        Rule(SgmlLinkExtractor(allow=('street', )), callback='parse_street'),
    )

    def parse_country(self, response):
        self.log('Hi, this is a country page! %s' % response.url)

    def parse_city(self, response):
        self.log('Hi, this is a city page! %s' % response.url)

    def parse_street(self, response):
        self.log('Hi, this is a street page! %s' % response.url)
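Regarding the DEPTH_LIMIT = 3 setting from the question: it only caps how deep the crawler follows links from the start URLs, it does not by itself make a spider follow links; the Rules above are what drive the following. As a hedged aside, newer Scrapy versions also accept the setting per spider instead of only in settings.py:

from scrapy.spiders import CrawlSpider

class MyDepthLimitedSpider(CrawlSpider):
    # Assumes a Scrapy version that supports per-spider custom_settings.
    name = 'depth_limited'
    custom_settings = {'DEPTH_LIMIT': 3}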

Scrapy: only parse from pages with meta noindex

I am trying to crawl a website and parse only pages with a noindex meta tag.
What is happening is that the crawler crawls the first level but finishes with the first page. It does not seem to follow the links.
The following is my code:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def _response_downloaded(self, response):
        sel = HtmlXPathSelector(response)
        if sel.xpath('//meta[@content="noindex"]'):
            return super(mydomainSpider, self).parse_items(response)
        return

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        yield items
The original _response_downloaded calls the _parse_response function which, besides calling the callback function, also follows links. From the Scrapy code:
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
You can add that follow-links part, though I believe it's not the best way to go (the leading _ may imply just that). Why not just check for the meta tag at the beginning of your parse_items function? And if you don't want to repeat this test, maybe even write a Python decorator.
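A rough sketch of that decorator idea (the only_noindex helper below is hypothetical, not part of Scrapy): it wraps a callback and only runs it when the page carries a noindex meta tag, returning an empty list otherwise so Scrapy still gets an iterable.

from scrapy.selector import HtmlXPathSelector

def only_noindex(callback):
    # Hypothetical helper, not part of Scrapy: skip the callback on pages
    # that do not carry a noindex meta tag.
    def wrapper(self, response):
        hxs = HtmlXPathSelector(response)
        if hxs.select('//meta[@content="noindex"]'):
            return callback(self, response)
        return []
    return wrapper

With that in place, parse_items could keep its scraping logic unchanged and simply be decorated with @only_noindex.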
I believe checking for the meta tag at the beginning of my parse_items, as @Guy Gavriely suggested, will be my best option. I will test the following code to see:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        yield items
Working code update: I needed to return items instead of yield. Yielding the list emits a single list object rather than individual items, while returning the list lets Scrapy iterate over the items it contains.
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        return items
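For what it's worth, the more idiomatic Scrapy version of the same callback yields each item individually instead of building a list first; a minimal sketch based on the code above:

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        if hxs.select('//meta[@content="noindex"]'):
            for site in hxs.select('//html'):
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.select('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                yield item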
