I have written a crawler to extract links and text from a webpage. This is the structure of the content:

DIV
- UL
  - LI
    - a
Here is my code:

from scrapy import Spider
from scrapy.selector import Selector
from stack.items import StackItem

class StackSpider(Spider):
    name = "stack"
    allowed_domains = ["stackoverflow.com"]
    start_urls = [
        "http://page.com",
    ]

    def parse(self, response):
        documents = Selector(response).xpath('//*[@id="node-329"]/div[1]/ul/li')
        for document in documents:
            item = StackItem()
            item['title'] = document.xpath('./a/text()').extract()
            item['link'] = document.xpath('/a/@href').extract()
            yield item
Basically, the XPath /a/@href is not working. If I comment that line out and extract only the text, it works. Please help me.
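For what it's worth, the likely culprit is that /a/@href starts with a slash, which makes it an absolute path evaluated from the document root rather than from each li, so it matches nothing. A relative expression should return the link; a minimal sketch of the corrected line:

item['link'] = document.xpath('./a/@href').extract()

(Plain 'a/@href' works the same way; both are evaluated relative to the current li node, just like the title XPath already is.)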
I want to scrape the mobile product features from Snapdeal.

//*[@id="productSpecs"]/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[2]
//*[@id="productSpecs"]/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[1]

These are the XPaths. I can see the results via the Scraper extension in Google Chrome, but I can't fetch the results through Scrapy.
from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from demo.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craigs"
    allowed_domains = ["www.snapdeal.com"]
    start_urls = ["http://www.snapdeal.com/product/samsung-galaxy-j2-8gb/655619199985"]

    def parse(self, response):
        # hxs = HtmlXPathSelector(response)
        sel = Selector(response)
        titles = sel.xpath("//*[@id='productSpecs']/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[2]")
        print titles
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            # item["Brand"] = titles.extract()
            items.append(item)
        print items
titles prints empty; the above is the sample code.
Edit your XPath as:

titles = sel.xpath("//*[@id='productSpecs']/div/div[2]/div[2]/div/table[1]/tr/td/table/tr/td[2]")

This happens because Chrome adds extra tbody tags when it renders the page; they are not present in the raw HTML that Scrapy downloads, so any XPath copied from the inspector must have the tbody steps removed.
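A quick way to confirm what Scrapy actually sees (assuming the product page is still reachable, and a Scrapy version recent enough to expose response in the shell) is the Scrapy shell:

scrapy shell "http://www.snapdeal.com/product/samsung-galaxy-j2-8gb/655619199985"
>>> response.xpath("//*[@id='productSpecs']//tbody")        # most likely empty
>>> response.xpath("//*[@id='productSpecs']//table[1]//td[2]/text()").extract()

If the tbody query comes back empty while the tbody-free one returns data, the rendered-DOM mismatch is confirmed.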
I have a Scrapy spider and I am using XPath selectors to extract the contents of the page. Kindly check where I am going wrong.
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request
from medicalproject.items import MedicalprojectItem

class MySpider(CrawlSpider):
    name = "medical"
    allowed_domains = ["yananow.org"]
    start_urls = ["http://yananow.org/query_stories.php"]

    rules = (
        Rule(SgmlLinkExtractor(allow=[r'display_story.php\?\id\=\d+']), callback='parse_page', follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
        items = []
        for title in titles:
            item = MedicalprojectItem()
            item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
            item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
            items.append(item)
        return items
There are a lot of issues with your code, so here is a different approach.

I opted against a CrawlSpider to have more control over the scraping process, especially for grabbing the name from the query page and the story from a detail page.

I tried to simplify the XPath statements by not diving into the (nested) table structures, instead looking for patterns in the content. So if you want to extract a story ... there must be a link to a story.

Here is the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy

class MyItem(scrapy.Item):
    name = scrapy.Field()
    story = scrapy.Field()

class MySpider(scrapy.Spider):
    name = 'medical'
    allowed_domains = ['yananow.org']
    start_urls = ['http://yananow.org/query_stories.php']

    def parse(self, response):
        rows = response.xpath('//a[contains(@href, "display_story")]')

        # loop over all links to stories
        for row in rows:
            myItem = MyItem()  # create a new item
            myItem['name'] = row.xpath('./text()').extract()  # assign name from link text
            story_url = response.urljoin(row.xpath('./@href').extract()[0])  # extract url from link
            request = scrapy.Request(url=story_url, callback=self.parse_detail)  # request the detail page with the story
            request.meta['myItem'] = myItem  # pass the item along with the request
            yield request

    def parse_detail(self, response):
        myItem = response.meta['myItem']  # retrieve the item (with the name) from the response
        text_raw = response.xpath('//font[@size=3]//text()').extract()  # extract the story text
        myItem['story'] = ' '.join(map(unicode.strip, text_raw))  # clean up the text and assign to the item
        yield myItem  # return the completed item
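Since the item class is included, the spider can be run without a full project scaffold; for example, if you save it as medical_spider.py (the filename is arbitrary):

scrapy runspider medical_spider.py -o stories.json

Note the code targets Python 2 (unicode.strip); under Python 3 that call would become str.strip.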
I am trying to write a program in Scrapy to open links and collect data from this tag: <p class="attrgroup"></p>.

I've managed to make Scrapy collect all the links from the given URL, but not to follow them. Any help is very much appreciated.
You need to yield Request instances for the links to follow, assign a callback, and extract the text of the desired p element in the callback:
# -*- coding: utf-8 -*-
import scrapy

# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://chicago.craigslist.org/search/emd?"
    ]
    BASE_URL = 'http://chicago.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        item = DmozItem()
        item["link"] = response.url
        item["attr"] = "".join(response.xpath("//p[@class='attrgroup']//text()").extract())
        return item
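As a side note, on Scrapy 1.0 and later the manual BASE_URL concatenation can be replaced with response.urljoin(), which also resolves relative paths correctly; a sketch of the reworked parse:

    def parse(self, response):
        for link in response.xpath('//a[@class="hdrlnk"]/@href').extract():
            yield scrapy.Request(response.urljoin(link), callback=self.parse_attr)

This makes the BASE_URL class attribute unnecessary.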
I just started learning Python/Scrapy. I was able to follow the tutorials successfully, but I am struggling with a 'test' scrape that I want to do on my own.

What I am trying to do now is go to http://jobs.walmart.com/search/finance-jobs and scrape the job listings.
However, I think I may be doing something wrong in the XPath, but I am not sure what.
There is no "id" for that table, so I am using its class.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class MySpider(BaseSpider):
    name = "walmart"
    allowed_domains = ["jobs.walmart.com"]
    start_urls = ["http://jobs.walmart.com/search/finance-jobs"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//table[@class='tableSearchResults']")
        items = []
        for titles in titles:
            item = walmart()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            items.append(item)
        return items
The problem, as you said, is your XPath. It is always useful to run:
scrapy view http://jobs.walmart.com/search/finance-jobs
before running your spider, to see how the website looks to Scrapy.
This should work now:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector

class MySpider(BaseSpider):
    name = "walmart"
    allowed_domains = ["jobs.walmart.com"]
    start_urls = ["http://jobs.walmart.com/search/finance-jobs"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//table[@class='tableSearchResults']/tr")
        items = []
        for title in titles:
            if title.select("td[@class='td1']/a").extract():
                item = walmart()  # create a fresh item per row, so rows don't share one object
                item["title"] = title.select("td[@class='td1']/a/text()").extract()
                item["link"] = title.select("td[@class='td1']/a/@href").extract()
                items.append(item)
        return items
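One assumption in both snippets: walmart is an item class defined elsewhere (typically the project's items.py). A minimal definition consistent with the fields used above would be:

from scrapy.item import Item, Field

class walmart(Item):
    title = Field()
    link = Field()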
I am trying to scrape Craigslist classifieds using Scrapy to extract items that are for sale.

I am able to extract the date, post title, and post URL, but am having trouble extracting the price.

For some reason the current code extracts all of the prices, but when I remove the // before the price span lookup, the price field comes back empty.
Can someone please review the code below and help me out?
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem

class MySpider(BaseSpider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://longisland.craigslist.org/search/sss?sort=date&query=raptor%20660&srchType=T"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        for titles in titles:
            item = CraigslistSampleItem()
            item['date'] = titles.select('span[@class="itemdate"]/text()').extract()
            item["title"] = titles.select("a/text()").extract()
            item["link"] = titles.select("a/@href").extract()
            item['price'] = titles.select('//span[@class="itempp"]/text()').extract()
            items.append(item)
        return items
itempp appears to be inside another element, itempnr. With the leading //, the query searches the whole document from the root, which is why every price on the page is returned for each item; without it, the path is relative to the current p, where itempp sits one level deeper than your expression expects. Perhaps it would work if you were to change //span[@class="itempp"]/text() to span[@class="itempnr"]/span[@class="itempp"]/text().
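To put the three variants side by side (a sketch, assuming the itempnr/itempp nesting described above):

# document-wide: matches every itempp span on the page, regardless of the current node
titles.select('//span[@class="itempp"]/text()')

# relative, stepping explicitly through the wrapper span
titles.select('span[@class="itempnr"]/span[@class="itempp"]/text()')

# relative descendant search: any itempp anywhere under the current <p>
titles.select('.//span[@class="itempp"]/text()')

The .// form is often the most robust choice, since it does not depend on the exact nesting depth.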