Scrapy - how to join data together from different parts of a website - python

I am in the process of building a crawler. I want it to navigate all available pages on the site, and [i] fill a number of data fields for each product, and [ii], for each product, drill into the corresponding product url and populate a number of other data fields. I want all of the data in the same {} for each product. But instead, the crawler carries out [i] and then [ii] separately, so the data from [ii] ends up in a separate {}.
I want to somehow add the data from [i] into [ii]. request.meta['item'] = item looks like something that could work, but I have not yet succeeded in getting it to work.
I have the following code:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlerItem

class Crawler1Spider(CrawlSpider):
    name = "crawler1"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        # visit each page
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
        # click on each product link
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="exhib_status exhib_status_interiors"]')), callback='parse_detail', follow=True),
    ]

    def parse_item(self, response):
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        results = []
        n = 0
        for element in elements:
            item = CrawlerItem()
            n = n + 1
            # work out how to put images into image folder
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            yield item
            #items.append(item)
        #return items

    def parse_detail(self, response):
        item = CrawlerItem()
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
Suggestion as to how I can get all the data into one {} for each product would be much appreciated.
UPDATE: 20/11/15
I have amended the code as follows:
# -*- coding: utf-8 -*-
import scrapy
import re
from scrapy import Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.http import Request
from maxine.items import CrawlItem

class Crawler1Spider(CrawlSpider):
    name = "test"
    allowed_domains = ["website.com"]
    start_urls = (
        'starturl.com',
    )

    rules = [
        Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="listnavpagenum"]')), callback='parse_item', follow=True),
    ]

    def parse_item(self, response):
        item = CrawlItem()
        sel = Selector(response)
        elements = sel.xpath('//div[@class="ez_listitem_wrapper"]')
        items = []
        n = 0
        for element in elements:
            n = n + 1
            # work out how to put images into image folder
            #item['image_urls'] = selector.xpath('//a[@class="exhib_status exhib_status_interiors"]/img/@src').extract()
            item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
            item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
            item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            item['count'] = n
            item_detail_url = item['item_url'] = element.xpath('//div[@class="ez_merge4"]/a/@href').extract_first()
            # crawl the item and pass the item to the following request with *meta*
            yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

    def parse_detail(self, response):
        # get the item from the previous passed meta
        item = response.meta['item']
        # keep populating the item
        item['telephone'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('[0-9]{4,}\s*[0-9]{4,}')
        item['website'] = response.xpath('//div[@id="ez_entry_contactinfo"]//text()').re('(?:http://)?www.[a-z0-9\/?_\- ]+.[0-9a-z]+')
        yield item
I'm now getting the data in the same {}; however, the spider is only extracting data from the last item on each page. Any further suggestions?

I am afraid you can't use rules for this case, as every request is independent by the time it reaches the site you want to crawl.
You'll need to define your own behaviour from start_requests:
def start_requests(self):
    yield Request(url=myinitialurl, callback=self.parse)

def parse(self, response):
    # crawl the initial page and then do something with that info
    yield Request(url=producturl, callback=self.parse_item)

def parse_item(self, response):
    item = CrawlerItem()
    # crawl the item and pass the item to the following request with *meta*
    yield Request(url=item_detail_url, callback=self.parse_detail, meta=dict(item=item))

def parse_detail(self, response):
    # get the item from the previous passed meta
    item = response.meta['item']
    # keep populating the item
    yield item

Try instantiating item = CrawlItem() within the for loop in parse_item, so that each product gets its own item instead of every iteration overwriting the same one.
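A minimal sketch of that change, reusing the selectors from the updated spider above (the leading dots on the XPaths are my addition to keep each lookup relative to the current product block; otherwise untested):

def parse_item(self, response):
    for n, element in enumerate(response.xpath('//div[@class="ez_listitem_wrapper"]'), start=1):
        item = CrawlItem()  # a fresh item per product, created inside the loop
        item['title'] = element.css('a.exhib_status.exhib_status_interiors').xpath('text()').extract_first()
        item['title_code'] = element.xpath('.//div[@class="ez_merge8"]/text()').extract_first()
        item['item_url'] = element.xpath('.//div[@class="ez_merge4"]/a/@href').extract_first()
        item['count'] = n
        # hand the half-filled item to the detail request; urljoin handles relative hrefs
        yield Request(url=response.urljoin(item['item_url']), callback=self.parse_detail, meta={'item': item})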

Related

Getting data from multiple links using scrapy

I am new to Scrapy and Python. I was trying to retrieve the data from https://in.bookmyshow.com/movies since I need the information on all the movies. But there is something wrong with my code, and I would like to know where I have gone wrong.
rules = (
    Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow=True),
)

def parse_items(self, response):
    for sel in response.xpath('//div[contains(@class, "movie-card")]'):
        item = Ex1Item()
        item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
        item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
        item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
        item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
        item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
        yield item
Your code seems to be fine. Perhaps the problem is outside of the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

class BookmyshowSpider(CrawlSpider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=('https://in\.bookmyshow\.com/movies/.*', )), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy

class BookmyshowSpider(scrapy.Spider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        links = response.xpath('//a/@href').re('movies/[^\/]+\/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() parses all links to movie pages from the start page. parse_movie() is used as a callback for all Requests to the specific movie pages. With this version you certainly have more control over the spider behavior.

Scrapy Spider cannot Extract contents of web page using xpath

I have a scrapy spider and I am using xpath selectors to extract the contents of the page. Kindly check where I am going wrong:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request

class MySpider(CrawlSpider):
    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']), callback='parse_page', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return(items)
There are a lot of issues with your code, so here is a different approach.
I opted against a CrawlSpider to have more control over the scraping process, especially for grabbing the name from the query page and the story from a detail page.
I tried to simplify the XPath statements by not diving into the (nested) table structures but looking for patterns of content instead: if you want to extract a story, there must be a link to that story.
Here comes the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy

class MyItem(scrapy.Item):
    name = scrapy.Field()
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        rows = response.xpath('//a[contains(@href,"display_story")]')
        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            request = scrapy.Request(url=story_url, callback=self.parse_detail)  # create request for detail page with story
            request.meta['myItem'] = myItem  # pass the item with the request
            yield request

    def parse_detail(self, response):
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        text_raw = response.xpath('//font[@size=3]//text()').extract()  # extract the story (text)
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))  # clean up the text and assign to item
        yield myItem  # return the item
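One small caveat: unicode.strip exists only on Python 2. On Python 3 the equivalent cleanup would be something along these lines (same logic, untested):

myItem['story'] = ' '.join(s.strip() for s in text_raw)  # str.strip replaces unicode.strip on Python 3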

I want Scrapy to run through each item once

I would like Scrapy to run through each item once so that relevant data is grouped together. As it is, it just puts all links, headers, dates etc. together. It is also posting everything to the file more than once. I am pretty new to both Scrapy and Python, so I would be grateful for any advice.
Here is my spider code:
from scrapy.spiders import Spider
from scrapy.selector import Selector
from fashioBlog.functions import extract_data
from fashioBlog.items import Fashioblog

class firstSpider(Spider):
    name = "first"
    allowed_domains = [
        "stopitrightnow.com"
    ]
    start_urls = [
        "http://www.stopitrightnow.com"
    ]

    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div[@class="post-outer"]')
        items = []
        for site in sites:
            item = Fashioblog()
            item['title'] = extract_data(site.xpath('//h3[normalize-space(@class)="post-title entry-title"]//text()').extract())
            item['url'] = extract_data(site.xpath('//div[normalize-space(@class)="post-body entry-content"]//@href').extract())
            item['date'] = extract_data(site.xpath('//h2[normalize-space(@class)="date-header"]/span/text()').extract())
            #item['body'] = site.xpath('//div[@class="post-body entry-content"]/i/text()').extract()
            item['labelLink'] = extract_data(site.xpath('//span[normalize-space(@class)="post-labels"]//@href').extract())
            item['comment'] = extract_data(site.xpath('//span[normalize-space(@class)="post-comment-link"]//text()').extract())
            item['picUrl'] = extract_data(site.xpath('//div[normalize-space(@class)="separator"]//@href').extract())
            #item['labelText'] = extract_data(site.xpath('(//i//text()').extract())
            #item['labelLink2'] = extract_data(site.xpath('(//i//@href').extract())
            yield item
Make your expressions context-specific by prepending a dot:
item['title'] = extract_data(site.xpath('.//h3[normalize-space(@class)="post-title entry-title"]//text()').extract())
                                         ^ HERE
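For completeness, the same dot prefix applies to every field in the loop; here is how the loop body would look with all lookups made relative to site (selectors otherwise copied from the question, not re-tested against the site):

for site in sites:
    item = Fashioblog()
    # the leading dot restricts each expression to the current post-outer div
    item['title'] = extract_data(site.xpath('.//h3[normalize-space(@class)="post-title entry-title"]//text()').extract())
    item['url'] = extract_data(site.xpath('.//div[normalize-space(@class)="post-body entry-content"]//@href').extract())
    item['date'] = extract_data(site.xpath('.//h2[normalize-space(@class)="date-header"]/span/text()').extract())
    item['labelLink'] = extract_data(site.xpath('.//span[normalize-space(@class)="post-labels"]//@href').extract())
    item['comment'] = extract_data(site.xpath('.//span[normalize-space(@class)="post-comment-link"]//text()').extract())
    item['picUrl'] = extract_data(site.xpath('.//div[normalize-space(@class)="separator"]//@href').extract())
    yield item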

Using Regex to Select Index Page Only

Is there a regex to select the index page when crawling a specific website? I'm selecting certain pages but also need just the index page on top of those.
I can't seem to figure out the proper way to express it. Basically, I want to crawl the index page, contact page, about page, and advertise page to look for contact information.
Here is the code.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item, Field
import csv
from urlparse import urlparse

class MailItem(Item):
    desc = Field()
    title = Field()
    url = Field()
    mail = Field()

class MailSpider(CrawlSpider):
    name = "marksey"
    parsed_hostnames = set()

    rules = [
        Rule(SgmlLinkExtractor(allow=(r'/contact', r'/about', r'/advertise',)), callback='parse_item', follow=True)
    ]
    ###r'^https?://[^/]+(/(\?.*|index\.php(\?.*)?)?)?$',

    start_urls = []
    allowed_domains = []

    with open('C:\Users\Vasily\MyStuff\emailtest\emailtest\scraped_data.csv', 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        next(reader)
        for row in reader:
            url = row[0].strip()
            if (url.strip() != ""):
                start_urls.append(url)
                hostname = urlparse(url).hostname
                allowed_domains.append(hostname)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for sel in response.xpath('//html/head'):
            item = MailItem()
            item['title'] = sel.xpath('title/text()').extract()
            item['desc'] = sel.xpath('//meta[@name=\'description\']/@content').extract()
            item['url'] = response.url
            item['mail'] = hxs.select('//body//text()').re(r'[\w.-]+@[\w.-]+')
            if not item['mail']:
                item['mail'] = item['url']
            items.append(item)
        hostname = urlparse(response.url).hostname
        self.parsed_hostnames.add(hostname)
        return items

    def process_links(self, links):
        return [l for l in links if urlparse(l.url).hostname not in self.parsed_hostnames]
What you need to do is call the parse_item() callback from parse_start_url(). This way you would also parse the url coming from start_urls, which I am assuming is an index page:
class MailSpider(CrawlSpider):
    ...

    def parse_start_url(self, response):
        return self.parse_item(response)

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        ...
See also:
Scrapy CrawlSpider doesn't crawl the first landing page

scrapy: A tiny "spider" in a spider?

When I try to scrape product review info from epinions.com, if the main review text is too long, there is a "read more" link to another page.
I took an example from "http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1"; you'll see what I mean if you look at the first review.
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        items = []
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()
            item["review"] = sites.select('p/span/text()').extract()

            # Everything works fine and i do have those four columns beautifully printed out, until....

            url2 = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url2)[3:-2])
            # This url is a string. when i print it out, it's like "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332", which looks legit.

            response2 = HtmlResponse(url)
            # I tried in a scrapy shell, it shows that this is a htmlresponse...

            hxs2 = HtmlXPathSelector(response2)
            fullReview = hxs2.select('//div[@class = "user_review_full"]')
            item["url"] = fullReview.select('p/text()').extract()
            # The three lines above works in an independent spider, where start_url is changed to the url just generated and everything.
            # However, i got nothing from item["url"] in this code.

            items.append(item)
        return items
Why does item["url"] return nothing?
Thanks!
You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class EpinionsTestItem(Item):
    title = Field()
    star = Field()
    date = Field()
    review = Field()

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()

            url = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url)[3:-2])

            yield Request(url=url, callback=self.parse_url2, meta={'item': item})

    def parse_url2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        fullReview = hxs.select('//div[@class = "user_review_full"]')
        item["review"] = fullReview.select('p/text()').extract()
        yield item
Also see the documentation.
Hope that helps.
