I wrote a Scrapy spider that has many start_urls and extracts email addresses from those URLs. The script takes ages to execute, so I want to tell Scrapy to stop crawling a particular site when it finds an email and move on to the next site.
EDIT: added code
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse
from entreprise.items import MailItem
class MailSpider(CrawlSpider):
    name = "mail"

    start_urls = []
    allowed_domains = []

    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                allowed_domains.append(hostname)

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items
The idea is to use the start_requests method to decide which URLs to crawl next. Additionally, we keep track of whether an email was already found for a hostname in the parsed_hostnames class-level set.
I've also changed the way you get the hostname from the URL; it uses urlparse now.
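For example (the URL below is purely illustrative; urlparse is from the standard library):
from urlparse import urlparse  # Python 2; on Python 3 this lives in urllib.parse

# the hostname comes straight out of the parsed URL, no need to split and re-join dot fragments
print urlparse("http://www.example.co.uk/contact").hostname  # -> www.example.co.uk
The full spider then looks like this: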
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse
class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)
                        yield self.make_requests_from_url(url)
                    else:
                        self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.remove(hostname)
                        self.rules[1].link_extractor.allow_domains.remove(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items
Should work in theory. Hope that helps.
I ended up using process_links:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse
class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item', process_links='process_links'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item', process_links='process_links')
    ]

    start_urls = []
    allowed_domains = []

    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        mails = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
        if mails:
            for mail in mails:
                item = MailItem()
                item['url'] = response.url
                item['mail'] = mail
                items.append(item)
            hostname = urlparse(response.url).hostname
            self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
Related
Is there a regex to select only the index page when crawling a specific website? I'm selecting certain pages but also need the index page on top of those.
I can't seem to figure out the proper way to write it. Basically, I want to crawl the index page, contact page, about page, and advertise page to look for contact information.
Here is the code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse
class MailItem(Item):
    desc = Field()
    title = Field()
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "marksey"

    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)), callback='parse_item', follow=True)
    ]
    ###r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',

    start_urls = []
    allowed_domains = []

    with open('C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[0].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath('//meta[@name=\'description\']/@content').extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
What you need to do is call the parse_item() callback from parse_start_url() - this way you would also parse the URLs coming from start_urls, which I am assuming are index pages:
class MailSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
See also:
Scrapy CrawlSpider doesn't crawl the first landing page
We're trying to crawl items such as 'product', 'price', etc., but we keep getting an indentation error.
The code we're using (crawlproduct.py):
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from productcrawl.items import ProductCrawlItem
class MySpider(BaseSpider):
name = "crawlproduct"
allowed_domains = ["yorcom.nl"]
f = open("items.txt")
start_urls = [url.strip() for url in f.readlines()]
f.close()
def parse(self, response):
hxs = HtmlXPathSelector(response)
events = hxs.select("//div[@class='productOverview']")
items = []
for event in events:
item = ProductCrawlItem()
item ["product"] = events.select("table/tbody/tr/td[#class='productTitle']/a/text()").extract()
item ["price"] = events.select("table/tbody/tr/td[#class='productPrice']/a/text()").extract()
item ["stock"] = events.select("table/tbody/tr/td[#class='productStock voorraad']/a/text()").extract()
item ["link"] = events.select("table/tbody/tr/td[#class='productTitle']/a").extract()
yield item
and items.py:
from scrapy.item import Item, Field
class ProductCrawlItem(Item):
    product = Field()
    price = Field()
    stock = Field()
    link = Field()
When we only use 1 field, it does work...
Does anyone know the problem?
Thanks in advance,
Dean
With the following indentation, this is probably what you intended:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from productcrawl.items import ProductCrawlItem
class MySpider(BaseSpider):
    name = "crawlproduct"
    allowed_domains = ["yorcom.nl"]

    f = open("items.txt")
    start_urls = [url.strip() for url in f.readlines()]
    f.close()

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        events = hxs.select("//div[@class='productOverview']")
        items = []
        for event in events:
            item = ProductCrawlItem()
            item ["product"] = events.select("table/tbody/tr/td[@class='productTitle']/a/text()").extract()
            item ["price"] = events.select("table/tbody/tr/td[@class='productPrice']/a/text()").extract()
            item ["stock"] = events.select("table/tbody/tr/td[@class='productStock voorraad']/a/text()").extract()
            item ["link"] = events.select("table/tbody/tr/td[@class='productTitle']/a").extract()
            yield item
I have a website I'm crawling which has whitespace before and after the URL inside its links, something like <a href=" http://www.stores.com/c/96894/ ">Test</a>.
Instead of crawling this:
http://www.stores.com/c/96894/
it crawls this:
http://www.store.com/c/%0A%0A/c/96894%0A%0A
Moreover, it causes an infinite loop on pages that contain such a link, producing URLs like this:
http://www.store.com/cp/%0A%0A/cp/96894%0A%0A/cp/96894%0A%0A
Any whitespace (\r, \n, \t, and space) before and after the URL is ignored by all browsers. How do I go about trimming the whitespace from the crawled URLs?
Here's my code.
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class StoreSpider(CrawlSpider):
    name = "cpages"
    allowed_domains = ["www.store.com"]
    start_urls = ["http://www.store.com",]

    rules = (
        Rule(SgmlLinkExtractor(allow=('/c/', ), deny=('grid=false', 'sort=', 'stores=', '\|\|', 'page=',)),
             callback="parse_items", follow=True,
             process_links=lambda links: [link for link in links if not link.nofollow],),
        Rule(SgmlLinkExtractor(allow=(), deny=('grid=false', 'sort=', 'stores=', '\|\|', 'page='))),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['anchor'] = response.meta.get('link_text')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        return items
I used process_value=cleanurl in my LinkExtractor instance
def cleanurl(link_text):
    return link_text.strip("\t\r\n ")
Here is the code, in case anyone runs into the same problem:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class storeSpider(CrawlSpider):
    name = "cppages"
    allowed_domains = ["www.store.com"]
    start_urls = ["http://www.store.com",]

    def cleanurl(link_text):
        return link_text.strip("\t\r\n '\"")

    rules = (
        Rule(SgmlLinkExtractor(allow=('/cp/', ), deny=('grid=false', 'sort=', 'stores=', r'\|\|', 'page=',), process_value=cleanurl),
             callback="parse_items", follow=True,
             process_links=lambda links: [link for link in links if not link.nofollow],),
        Rule(SgmlLinkExtractor(allow=('/cp/', '/browse/', ), deny=('grid=false', 'sort=', 'stores=', r'\|\|', 'page='), process_value=cleanurl)),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['anchor'] = response.meta.get('link_text')
            item['canonical'] = site.xpath('//head/link[@rel="canonical"]/@href').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        return items
You can replace the whitespace with '' like this:
url = response.url
item['url'] = url.replace(' ', '')
Or, using a regular expression:
import re
url = response.url
item['url'] = re.sub(r'\s', '', url)
I am trying to crawl a website and parse only pages that have a noindex meta tag.
What is happening is that the crawler crawls the first level but finishes after the first page. It does not seem to follow the links.
The following is my code:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def _response_downloaded(self, response):
        sel = HtmlXPathSelector(response)
        if sel.xpath('//meta[@content="noindex"]'):
            return super(mydomainSpider, self).parse_items(response)
        return

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        yield items
The original _response_downloaded calls the _parse_response function, which, besides calling the callback, also follows links. From the Scrapy code:
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
You can add that link-following part, though I believe it's not the best way to go (the leading underscore may imply just that). Why not just check for the meta tag at the beginning of your parse_items function? And if you don't want to repeat this test, maybe even write a Python decorator.
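A minimal sketch of such a decorator (parse_only_noindex is a made-up name, not a Scrapy API; it reuses the wallspider.items.Website item from the question):
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from wallspider.items import Website

def parse_only_noindex(callback):
    # Run the wrapped callback only when the page carries a noindex meta tag;
    # otherwise return no items. Link following is unaffected, since the
    # CrawlSpider rules keep extracting links from every response.
    def wrapper(self, response):
        if HtmlXPathSelector(response).select('//meta[@content="noindex"]'):
            return callback(self, response)
        return []
    return wrapper

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True),
    )

    @parse_only_noindex
    def parse_items(self, response):
        # only reached for pages with the noindex meta tag
        item = Website()
        item['url'] = response.url
        item['referer'] = response.request.headers.get('Referer')
        return [item]
This way the test lives in one place and any callback you decorate gets the same behaviour.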
I believe checking for the meta tag at the beginning of my parse_items, as @Guy Gavriely suggested, will be my best option. I will test out the following code to see:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        yield items
Working code update: I needed to return items instead of yielding them:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website
class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]

    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        return items
I have a file which has a list of domains. I need to crawl each domain (i.e. the whole website) to get RSS links: recursively crawl each page of the website, collect the RSS links from each page, and write them to a JSON file corresponding to the domain. This is my code for just one website:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']

    #rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        sites = sel.select('/html/head/link[@type=application/rss+xml]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = sites
        #items.append(item)
        return item
I tried running:
scrapy crawl apple -o items.json -t json
but items.json only contains an opening bracket [.
This is my items.py file:
from scrapy.item import Item, Field
class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()
Your XPath expression needs to have quotes around the "application/rss+xml" test value.
Try something like:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
class AppleItem(Item):
    reference_link = Field()
    rss_link = Field()

class AppleSpider(CrawlSpider):
    name = 'apple'
    allowed_domains = ['apple.com']
    start_urls = ['http://apple.com']

    rules = [Rule(SgmlLinkExtractor(allow=()), follow=True, callback='parse_item')]

    def parse_item(self, response):
        sel = HtmlXPathSelector(response)
        rsslinks = sel.select('/html/head/link[@type="application/rss+xml"]/@href').extract()
        #items = []
        item = AppleItem()
        item['reference_link'] = response.url
        item['rss_link'] = rsslinks
        #items.append(item)
        return item