Scrapy Spider cannot Extract contents of web page using xpath - python

I have a Scrapy spider and I am using XPath selectors to extract the contents of the page. Kindly check where I am going wrong:
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request

class MySpider(CrawlSpider):
    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']), callback='parse_page', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return items

There are a lot of issues with your code so here is a different approach.
I opted against a CrawlSpider to have more control over the scraping process, especially for grabbing the name from the query page and the story from a detail page.
I tried to simplify the XPath statements by not diving into the (nested) table structures but looking for patterns of content. So if you want to extract a story ... there must be a link to a story.
Here comes the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy

class MyItem(scrapy.Item):
    name = scrapy.Field()
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        rows = response.xpath('//a[contains(@href, "display_story")]')
        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            request = scrapy.Request(url=story_url, callback=self.parse_detail)  # create request for detail page with story
            request.meta['myItem'] = myItem  # pass the item with the request
            yield request

    def parse_detail(self, response):
        myItem = response.meta['myItem']  # extract the item (with the name) from the response
        text_raw = response.xpath('//font[@size=3]//text()').extract()  # extract the story (text)
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))  # clean up the text and assign to item
        yield myItem  # return the item
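If you want the scraped items written to a file, Scrapy's built-in feed exports can do it from the command line (a usage note of mine, not part of the original answer; the -o option is standard Scrapy):

scrapy crawl medical -o stories.csv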

Related

Printing scrapy data to csv

Hi, I started Scrapy recently and wrote a crawler. But when outputting the data to CSV, it is all printed in a single row. How can I print each item on its own row?
In my case I am printing links from a website. It works well when printed in JSON format.
Here's the code.
The items.py file.
import scrapy
from scrapy.item import Item, Field

class ErcessassignmentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = Field()
    # pass
The mycrawler.py
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector  # deprecated
from scrapy.selector import Selector
from ercessAssignment.items import ErcessassignmentItem

class MySpider(BaseSpider):
    name = "ercessSpider"
    allowed_domains = ["site_url"]
    start_urls = ["site_url"]

    def parse(self, response):
        hxs = Selector(response)
        links = hxs.xpath("//p")
        items = []
        for linkk in links:
            item = ErcessassignmentItem()
            item["link"] = linkk.xpath("//a/@href").extract()
            items.append(item)
            return items  # note: the return sits inside the loop, as the answers below point out
You should have proper indentation in your code:
import scrapy
from scrapy.item import Item, Field

class ErcessassignmentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    link = Field()
Then in your spider, do not use return: your for loop will run only once, and you will get only one row in the CSV. Use yield instead.
Second, where is your code that puts items into the CSV? I guess you are using Scrapy's default way of storing items.
In case you do not already know, please run your scraper like this:
scrapy crawl ercessSpider -o my_output.csv
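Alternatively (an addition of mine, using the legacy feed-export settings from older Scrapy releases; newer releases bundle these into a FEEDS dict), you can configure the export once in settings.py:

# settings.py -- a minimal sketch with the legacy feed-export settings
FEED_FORMAT = 'csv'            # export as CSV instead of the default
FEED_URI = 'my_output.csv'     # file to write
FEED_EXPORT_FIELDS = ['link']  # optional: fixes the column order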
Your spider code should be like this; notice the changes I made:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector  # deprecated
from scrapy.selector import Selector
from ercessAssignment.items import ErcessassignmentItem

class MySpider(BaseSpider):
    name = "ercessSpider"
    allowed_domains = ["site_url"]
    start_urls = ["site_url"]

    def parse(self, response):
        hxs = Selector(response)
        links = hxs.xpath("//p")
        for linkk in links:
            item = ErcessassignmentItem()
            item["link"] = linkk.xpath("//a/@href").extract()
            yield item
links = hxs.xpath("//p//a/@href").extract()
for linkk in range(len(links)):
    item = ErcessassignmentItem()
    item["link"] = links[linkk]
    yield item
This also works with a CSS selector; if the two solutions above are not working, you can try this.
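For reference, here is a minimal sketch of the same extraction done with CSS selectors (an assumption on my part: this uses response.css and the ::attr() pseudo-element, which are available from Scrapy 1.0 on):

def parse(self, response):
    # select every link inside a paragraph and pull out its href
    for href in response.css("p a::attr(href)").extract():
        item = ErcessassignmentItem()
        item["link"] = href
        yield item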
Your code above does not print anything. Moreover, I don't see any CSV part. Also, the items list created in parse() will never be longer than 1, due to what looks like an indentation error to me (i.e. you return after the first iteration of the for loop). For better readability, you could use the for/else construct here:
def parse(self, response):
    hxs = Selector(response)
    links = hxs.xpath("//p")
    items = []
    for linkk in links:
        item = ErcessassignmentItem()
        item["link"] = linkk.xpath("//a/@href").extract()
        items.append(item)
    else:  # after for loop is finished
        # either return items
        # or print link in items here without returning
        for link in items:  # take one link after another
            print link  # and print it in one line each
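As a side note (my illustration, not from the original answer): the else branch of a for loop runs exactly once, after the loop exhausts its iterable without hitting a break, which is why the printing above happens only after all items are collected:

for link in ["http://example.org/a", "http://example.org/b"]:
    print link          # runs once per element
else:
    print "loop done"   # runs once, since the loop never breaks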

Scraping Snapdeal to extract the Mobile phone features

I want to scrape the mobile product features from snapdeal.
//*[@id="productSpecs"]/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[2]
//*[@id="productSpecs"]/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[1]
These are the XPaths. I can see the results via the Scraper extension in Google Chrome, but I can't fetch the results through Scrapy.
from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from demo.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craigs"
    allowed_domains = ["www.snapdeal.com"]
    start_urls = ["http://www.snapdeal.com/product/samsung-galaxy-j2-8gb/655619199985"]

    def parse(self, response):
        # hxs = HtmlXPathSelector(response)
        sel = Selector(response)
        titles = sel.xpath("//*[@id='productSpecs']/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[2]")
        print titles
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            # item["Brand"] = titles.extract()
            items.append(item)
        print items
titles prints empty; this is the sample code.
Edit your xpath as:
titles = sel.xpath("//*[@id='productSpecs']/div/div[2]/div[2]/div/table[1]/tr/td/table/tr/td[2]")
This happens because Chrome adds an extra tbody tag when rendering; the raw HTML that Scrapy downloads does not contain it.
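A quick way to verify which markup Scrapy actually receives is the interactive shell (my suggestion; in Scrapy 0.24 the shell exposes a sel selector, newer versions expose response directly):

scrapy shell "http://www.snapdeal.com/product/samsung-galaxy-j2-8gb/655619199985"
>>> sel.xpath("//*[@id='productSpecs']//table[1]/tr/td/table/tr/td[2]//text()").extract()

If the tbody-free expression returns data here, it will work in the spider as well.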

scrapy - scraping a field on next page and then returning to old page

I want to scrape data from the site: http://www.consumercomplaints.in/?search=ecom-express#
I am hoping my request is quite simple and straightforward for the more experienced Scrapy users out there.
Problem: I am trying to scrape data for each review. By data, I mean main title, subtitle, username, date and review. But I am not able to get the review, since for the review I want to go to the link embedded with the main title and then get the whole review, not the brief one on the first page, and do this for each review.
My spider class:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.spider import BaseSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from consumercomplaint.items import ConsumercomplaintItem

class MySpider(BaseSpider):
    name = "consumer"
    allowed_domains = ["http://www.consumercomplaints.in"]
    start_urls = ["http://www.consumercomplaints.in/?search=ecom-express&page=11"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select('//table[@width="100%"]')
        print titles
        items = []
        del(titles[0])
        for i in titles:
            item = ConsumercomplaintItem()
            item["maintitle"] = i.select('.//a[1]//text()').extract()
            item["username"] = i.select('.//td[@class="small"]//a[2]/text()').extract()
            item["date"] = i.select('.//td[@class="small"]/text()').extract()
            item["subtitle"] = i.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
            item["complaint"] = i.select('.//td[@class="compl-text"]/div/text()').extract()
            items.append(item)
        return items
My item class:
from scrapy.item import Item, Field

class ConsumercomplaintItem(Item):
    maintitle = Field()
    username = Field()
    date = Field()
    subtitle = Field()
    complaint = Field()
I would do it in two phases:

Phase 1, in parse():
a) save the partial data into an item
b) extract the link of the full complaint
c) create a new request and save your item into request.meta
d) yield the request

Phase 2, in the detail callback:
a) extract the full complaint
b) extract the item from meta
c) save the complaint into the item's field
d) yield the item
def parse(self, response):
    hxs = HtmlXPathSelector(response)
    titles = hxs.select('//table[@width="100%"]')
    print titles
    del(titles[0])
    for i in titles:
        item = ConsumercomplaintItem()
        item["maintitle"] = i.select('.//a[1]//text()').extract()
        item["username"] = i.select('.//td[@class="small"]//a[2]/text()').extract()
        item["date"] = i.select('.//td[@class="small"]/text()').extract()
        item["subtitle"] = i.select('.//td[@class="compl-text"]/div/b[1]/text()').extract()
        # placeholder XPath: point it at the link that leads to the full complaint
        complaint_link = i.xpath('//complaint/link/a/@href').extract_first()
        complaint_page = response.urljoin(complaint_link)
        request = scrapy.Request(complaint_page, callback=self.parse_complaint)  # needs: import scrapy
        request.meta['item'] = item  # pass the partially filled item along with the request
        yield request

def parse_complaint(self, response):
    item = response.meta['item']  # the item carrying maintitle, username, date, subtitle
    # placeholder XPath: point it at the element holding the full complaint text
    item['complaint'] = response.xpath('/complaint/path/text()').extract_first()
    yield item

HTMLXPathSelector for Scrappy returning null results

I just started learning python / Scrapy. I was able to follow tutorials successfully but I am struggling with a 'test' scraping that I want to do on my own.
What I am trying to do now is go on http://jobs.walmart.com/search/finance-jobs and scrape the job listing.
However, I think I may be doing something wrong in the XPath, but I am not sure what.
There is no "id" for that table, so I am using its class.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class MySpider(BaseSpider):
    name = "walmart"
    allowed_domains = ["jobs.walmart.com"]
    start_urls = ["http://jobs.walmart.com/search/finance-jobs"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//table[@class='tableSearchResults']")
        items = []
        for titles in titles:
            item = walmart()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
Here is what the page source looks like: (screenshot from the original post omitted)
The problem, as you said, is your XPath. It is always useful to run:
scrapy view http://jobs.walmart.com/search/finance-jobs
before running your spider, to see how the website looks to Scrapy.
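You can also test selectors interactively before committing them to the spider (my suggestion; old Scrapy shells expose an hxs selector object, newer ones expose response):

scrapy shell "http://jobs.walmart.com/search/finance-jobs"
>>> hxs.select("//table[@class='tableSearchResults']/tr").extract()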
This should work now:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class MySpider(BaseSpider):
    name = "walmart"
    allowed_domains = ["jobs.walmart.com"]
    start_urls = ["http://jobs.walmart.com/search/finance-jobs"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//table[@class='tableSearchResults']/tr")
        items = []
        for title in titles:
            if title.select("td[@class='td1']/a").extract():
                item = walmart()  # create a fresh item per row
                item["title"] = title.select("td[@class='td1']/a/text()").extract()
                item["link"] = title.select("td[@class='td1']/a/@href").extract()
                items.append(item)
        return items

scrapy: A tiny "spider" in a spider?

So when I try to scrape product review info from epinions.com, if the main review text is too long, it has a "read more" link to another page.
I took an example from http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1 (you'll see what I mean if you look at the first review).
I am wondering: is it possible to have a tiny spider in each iteration of the for loop to grab the url and scrape the review out of the new link? I have the following code, but it doesn't work for the tiny "spider".
Here is my code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from epinions_test.items import EpinionsTestItem
from scrapy.http import Response, HtmlResponse

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        items = []
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()
            item["review"] = sites.select('p/span/text()').extract()
            # Everything works fine and I do have those four columns beautifully printed out, until....
            url2 = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url2)[3:-2])
            # This url is a string. When I print it out, it's like
            # "http://www.epinions.com/review/samsung-galaxy-note-16-gb-cell-phone/content_624031731332",
            # which looks legit.
            response2 = HtmlResponse(url)
            # I tried in a scrapy shell, it shows that this is a HtmlResponse...
            hxs2 = HtmlXPathSelector(response2)
            fullReview = hxs2.select('//div[@class="user_review_full"]')
            item["url"] = fullReview.select('p/text()').extract()
            # The three lines above work in an independent spider, where start_url is
            # changed to the url just generated and everything.
            # However, I got nothing from item["url"] in this code.
            items.append(item)
        return items
Why item["url"] returns nothing?
Thanks!
You should instantiate a new Request in the callback and pass your item in the meta dict:
from scrapy.http import Request
from scrapy.item import Item, Field
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class EpinionsTestItem(Item):
    title = Field()
    star = Field()
    date = Field()
    review = Field()

class MySpider(BaseSpider):
    name = "epinions"
    allow_domains = ["epinions.com"]
    start_urls = ['http://www.epinions.com/reviews/samsung-galaxy-note-16-gb-cell-phone/pa_~1']

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="review_info"]')
        for sites in sites:
            item = EpinionsTestItem()
            item["title"] = sites.select('h2/a/text()').extract()
            item["star"] = sites.select('span/a/span/@title').extract()
            item["date"] = sites.select('span/span/span/@title').extract()
            url = sites.select('p/span/a/@href').extract()
            url = str("http://www.epinions.com%s" % str(url)[3:-2])
            yield Request(url=url, callback=self.parse_url2, meta={'item': item})

    def parse_url2(self, response):
        hxs = HtmlXPathSelector(response)
        item = response.meta['item']
        fullReview = hxs.select('//div[@class="user_review_full"]')
        item["review"] = fullReview.select('p/text()').extract()
        yield item
Also see documentation.
Hope that helps.
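For reference (my addition, assuming Scrapy 1.0+ APIs: response.xpath, response.urljoin and extract_first), the parse method of the same pattern condenses to:

import scrapy

def parse(self, response):
    for site in response.xpath('//div[@class="review_info"]'):
        item = EpinionsTestItem()
        item["title"] = site.xpath('h2/a/text()').extract()
        # build an absolute url instead of slicing strings
        url = response.urljoin(site.xpath('p/span/a/@href').extract_first())
        yield scrapy.Request(url, callback=self.parse_url2, meta={'item': item})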
