Using Regex to Select Index Page Only - python

Is there a regex to select only the index page when crawling a specific website? I'm already selecting certain pages, but I also need just the index page on top of those.
I can't seem to figure out the proper way to write it. Basically, I want to crawl the index page, contact page, about page, and advertise page to look for contact information.
Here is the code:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    desc = Field()
    title = Field()
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "marksey"
    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)), callback='parse_item', follow=True)
    ]
    ###r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',

    start_urls = []
    allowed_domains = []

    with open('C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[0].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath('//meta[@name=\'description\']/@content').extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]

What you need to do is call the parse_item() callback from parse_start_url() - this way you would also parse the URLs coming from start_urls, which I am assuming are the index pages:
class MailSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
See also:
Scrapy CrawlSpider doesn't crawl the first landing page
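If, in addition, you want the rules to follow index links that appear on other pages (parse_start_url only covers the start URL itself), one option is to add the pattern you already have commented out back into the link extractor's allow list. A minimal sketch, reusing that same regex and the Rule/SgmlLinkExtractor imports from the question:

# Sketch only: the first pattern is the one commented out above; it matches
# bare hostnames (with or without a query string) and /index.php URLs.
index_re = r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$'

rules = [
    Rule(SgmlLinkExtractor(allow=(index_re, r'/contact', r'/about', r'/advertise')),
         callback='parse_item', follow=True),
]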

Related

Scrapy - how to join data together from different parts of a website

I am in the process of building a crawler. I want it to navigate all available pages on the site and, [i] fill a number of data fields for each product, and [ii] for each product, drill into the corresponding product URL and populate a number of other data fields. I want all of the data in the same {} for each product. But instead, the crawler carries out [i] and then [ii], so that part [ii] is populated in a separate {}.
I want to somehow add the data from [i] into [ii]. request.meta['item'] = item looks like something that could work, but I have not yet succeeded in getting it to work.
I have the following code:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlerItem

class Crawler1Spider(CrawlSpider):
    name = "crawler1"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        #visit each page
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
        #click on each product link
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="exhib_status exhib_status_interiors"]')), callback='parse_detail', follow=True),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        results = []
        n = 0
        for element in elements:
            item = CrawlerItem()
            n = n + 1
            #work out how to put images into image folder
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            yield item
            #items.append(item)
        #return items

    def parse_detail(self, response):
        item = CrawlerItem()
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
Suggestions as to how I can get all the data into one {} for each product would be much appreciated.
UPDATE: 20/11/15
I have amended the code as follows:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlItem

class Crawler1Spider(CrawlSpider):
    name = "test"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        item = CrawlItem()
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        n = 0
        for element in elements:
            n = n + 1
            #work out how to put images into image folder
            #item['image_urls'] = selector.xpath('//a[@class="exhib_status exhib_status_interiors"]/img/@src').extract()
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            item_detail_url = item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            # crawl the item and pass the item to the following request with *meta*
            yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

    def parse_detail(self, response):
        # get the item from the previously passed meta
        item = response.meta['item']
        # keep populating the item
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
I'm now getting the data in the same {}'s; however, the spider is only extracting data from the last item on each page. Any further suggestions?
I am afraid you can't use rules for this case, as every request is independent by the time it reaches the site you want to crawl.
You'll need to define your own behaviour starting from start_requests:
def start_requests(self):
    yield Request(url=myinitialurl, callback=self.parse)

def parse(self, response):
    # crawl the initial page and then do something with that info
    yield Request(url=producturl, callback=self.parse_item)

def parse_item(self, response):
    item = CrawlerItem()
    # crawl the item and pass the item to the following request with *meta*
    yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

def parse_detail(self, response):
    # get the item from the previously passed meta
    item = response.meta['item']
    # keep populating the item
    yield item
Try instantiating item = CrawlItem() within the for loop in parse_item.
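That comment points at the cause of the "only the last item per page" symptom: a single CrawlItem is created before the loop and then mutated on every pass, so every yielded Request carries a reference to the same object. A rough sketch of the fix, assuming the imports and field names from the update above (Request from scrapy.http, CrawlItem from maxine.items); the relative './/' prefix on the item_url XPath is an assumption, since the absolute '//div[...]' form returns the first match on the whole page for every element:

def parse_item(self, response):
    # one fresh item per product, created inside the loop
    for n, element in enumerate(response.xpath('//div[@class="ez_listitem_wrapper"]'), start=1):
        item = CrawlItem()
        item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
        item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
        # relative XPath (note the leading dot) so each wrapper yields its own link
        item['item_url'] = element.xpath('.//div[@class="ez_merge4"]/a/@href').extract_first()
        item['count'] = n
        # hand this product's item to the detail callback via meta
        yield Request(url=response.urljoin(item['item_url']),
                      callback=self.parse_detail, meta={'item': item})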

Scrapy Spider cannot Extract contents of web page using xpath

I have a Scrapy spider and I am using XPath selectors to extract the contents of the page. Kindly check where I am going wrong:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request

class MySpider(CrawlSpider):
    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']), callback='parse_page', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return(items)
There are a lot of issues with your code, so here is a different approach.
I opted against a CrawlSpider in order to have more control over the scraping process, especially for grabbing the name from the query page and the story from a detail page.
I tried to simplify the XPath statements by not diving into the (nested) table structures but instead looking for patterns of content. So if you want to extract a story ... there must be a link to a story.
Here is the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy

class MyItem(scrapy.Item):
    name = scrapy.Field()
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        rows = response.xpath('//a[contains(@href,"display_story")]')

        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            request = scrapy.Request(url=story_url, callback=self.parse_detail)  # create request for detail page with story
            request.meta['myItem'] = myItem  # pass the item along with the request
            yield request

    def parse_detail(self, response):
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        text_raw = response.xpath('//font[@size=3]//text()').extract()  # extract the story (text)
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))  # clean up the text and assign to item
        yield myItem  # return the item
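One small caveat: unicode.strip exists only on Python 2. If you run this spider on Python 3, a hedged equivalent of parse_detail would strip with str methods instead:

def parse_detail(self, response):
    myItem = response.meta['myItem']
    text_raw = response.xpath('//font[@size=3]//text()').extract()
    # str.strip replaces unicode.strip on Python 3; same join as above
    myItem['story'] = ' '.join(fragment.strip() for fragment in text_raw)
    yield myItem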

scrapy - scraping a field on next page and then returning to old page

I want to scrape data from the site: http://www.consumercomplaints.in/?search=ecom-express#
I am hoping my request is quite simple and straightforward for the more experienced Scrapy users out there.
Problem: I am trying to scrape data for each review. By data, I mean the main title, subtitle, username, date, and review. But I am not able to get the review, since for the review I need to go to the link embedded in the main title and then fetch the whole review, not the brief one on the first page, and do this for each review.
My spider class:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from consumercomplaint.items import ConsumercomplaintItem

class MySpider(BaseSpider):
    name = "consumer"
    allowed_domains = ["http://www.consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=ecom-express&page=11"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//table[@width="100%"]')
        print titles
        items = []
        del(titles[0])
        for i in titles:
            item = ConsumercomplaintItem()
            item["maintitle"] = i.select('.//a[1]//text()').extract()
            item["username"] = i.select('.//td[@class="small"]//a[2]/text()').extract()
            item["date"] = i.select('.//td[@class="small"]/text()').extract()
            item["subtitle"] = i.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            item["complaint"] = i.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)
        return items
My item class:
from scrapy.item import Item, Field

class ConsumercomplaintItem(Item):
    maintitle = Field()
    username = Field()
    date = Field()
    subtitle = Field()
    complaint = Field()
I would do it in two phases.
Phase one, in parse():
a) save the partial data into the item
b) extract the link of the full complaint
c) create a new request and save your item into request.meta
d) yield the request
Phase two, in parse_complaint():
a) extract the full complaint
b) extract the item from meta
c) save the complaint into the item's field
d) yield the item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//table[@width="100%"]')
    print titles
    items = []
    del(titles[0])
    for i in titles:
        item = ConsumercomplaintItem()
        item["maintitle"] = i.select('.//a[1]//text()').extract()
        item["username"] = i.select('.//td[@class="small"]//a[2]/text()').extract()
        item["date"] = i.select('.//td[@class="small"]/text()').extract()
        item["subtitle"] = i.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
        # placeholder XPath: replace with the selector for the link wrapped around the main title
        complaint_link = i.xpath('//complaint/link/a/@href').extract_first()
        complaint_page = response.urljoin(complaint_link)
        # assumes `import scrapy` at the top of the spider module
        request = scrapy.Request(complaint_page, callback=self.parse_complaint)
        request.meta['item'] = item
        yield request

def parse_complaint(self, response):
    item = response.meta['item']
    # placeholder XPath: replace with the selector for the full complaint text
    item['complaint'] = response.xpath('/complaint/path/text()').extract_first()
    yield item

scrapy: A tiny "spider" in a spider?

When I try to scrape product review info from epinions.com, if the main review text is too long, there is a "read more" link to another page.
I took an example from "http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1" - you'll see what I mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the URL and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        items = []
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()
            item["review"] = sites.select('p/span/text()').extract()

            # Everything works fine and I do have those four columns beautifully printed out, until....

            url2 = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url2)[3:-2])

            # This url is a string. When I print it out, it's like "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332", which looks legit.

            response2 = HtmlResponse(url)

            # I tried it in a scrapy shell, and it shows that this is an HtmlResponse...

            hxs2 = HtmlXPathSelector(response2)
            fullReview = hxs2.select('//div[@class = "user_review_full"]')
            item["url"] = fullReview.select('p/text()').extract()

            # The three lines above work in an independent spider, where start_url is changed to the url just generated.
            # However, I get nothing from item["url"] in this code.

            items.append(item)
        return items
Why does item["url"] return nothing?
Thanks!
You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class EpinionsTestItem(Item):
    title = Field()
    star = Field()
    date = Field()
    review = Field()

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')

        for site in sites:
            item = EpinionsTestItem()
            item["title"] = site.select('h2/a/text()').extract()
            item["star"] = site.select('span/a/span/@title').extract()
            item["date"] = site.select('span/span/span/@title').extract()

            url = site.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url)[3:-2])

            yield Request(url=url, callback=self.parse_url2, meta={'item': item})

    def parse_url2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']

        fullReview = hxs.select('//div[@class="user_review_full"]')
        item["review"] = fullReview.select('p/text()').extract()

        yield item
See also the documentation.
Hope that helps.
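As a side note (not part of the original answer): in more recent Scrapy releases the same hand-off is usually written with response.follow, which resolves the relative href for you, and Request.cb_kwargs instead of meta. A rough sketch of the two callbacks in that style, assuming the same XPaths still apply:

def parse(self, response):
    for site in response.xpath('//div[@class="review_info"]'):
        item = EpinionsTestItem()
        item["title"] = site.xpath('h2/a/text()').extract()
        item["star"] = site.xpath('span/a/span/@title').extract()
        item["date"] = site.xpath('span/span/span/@title').extract()
        # follow the "read more" link and pass the partially filled item explicitly
        yield response.follow(site.xpath('p/span/a/@href').extract_first(),
                              callback=self.parse_full_review,
                              cb_kwargs={'item': item})

def parse_full_review(self, response, item):
    # the full text lives on the detail page; finish the item there
    item["review"] = response.xpath('//div[@class="user_review_full"]/p/text()').extract()
    yield item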

How to tell python scrapy to move to the next start URL

I wrote a Scrapy spider that has many start_urls and extracts email addresses from these URLs. The script takes ages to execute, so I want to tell Scrapy to stop crawling a particular site when it finds an email and move on to the next site.
EDIT: added code
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
import csv
from urlparse import urlparse
from entreprise.items import MailItem

class MailSpider(CrawlSpider):
    name = "mail"

    start_urls = []
    allowed_domains = []

    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                fragments = urlparse(url).hostname.split(".")
                hostname = ".".join(len(fragments[-2]) < 4 and fragments[-3:] or fragments[-2:])
                allowed_domains.append(hostname)

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        return items
The idea is to use the start_requests method to decide which URLs to crawl next. Additionally, we keep track of whether an email was parsed for a hostname in the parsed_hostnames class-level set.
Also, I've changed the way you get the hostname from the URL, using urlparse now.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"

    parsed_hostnames = set()
    allowed_domains = []

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item')
    ]

    def start_requests(self):
        with open('scraped_data.csv', 'rb') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            next(reader)
            for row in reader:
                url = row[5].strip()
                if url:
                    hostname = urlparse(url).hostname
                    if hostname not in self.parsed_hostnames:
                        if hostname not in self.allowed_domains:
                            self.allowed_domains.append(hostname)
                            self.rules[0].link_extractor.allow_domains.add(hostname)
                            self.rules[1].link_extractor.allow_domains.add(hostname)
                        yield self.make_requests_from_url(url)
                    else:
                        self.allowed_domains.remove(hostname)
                        self.rules[0].link_extractor.allow_domains.remove(hostname)
                        self.rules[1].link_extractor.allow_domains.remove(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for mail in hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+'):
            item = MailItem()
            item['url'] = response.url
            item['mail'] = mail
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items
Should work in theory. Hope that helps.
I ended up using process_links:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "mail"
    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=('.+')), follow=True, callback='parse_item', process_links='process_links'),
        Rule(SgmlLinkExtractor(allow=('.+')), callback='parse_item', process_links='process_links')
    ]

    start_urls = []
    allowed_domains = []

    with open('scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[5].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        mails = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
        if mails:
            for mail in mails:
                item = MailItem()
                item['url'] = response.url
                item['mail'] = mail
                items.append(item)
            hostname = urlparse(response.url).hostname
            self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
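A note on portability: the snippets in this question are Python 2 era Scrapy (urlparse, SgmlLinkExtractor, opening the CSV in 'rb' mode). If you need to build the same start_urls/allowed_domains lists on Python 3, a rough equivalent of the CSV-loading block would be:

import csv
from urllib.parse import urlparse  # Python 3 location of urlparse

start_urls = []
allowed_domains = []

# text mode with newline='' replaces the Python 2 'rb' open for the csv module
with open('scraped_data.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    next(reader)  # skip the header row
    for row in reader:
        url = row[5].strip()
        if url:
            start_urls.append(url)
            allowed_domains.append(urlparse(url).hostname)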
