Scrapy crawl Multiple XPathSelector on same page

Scrapy crawl Multiple XPathSelector on same page - python

Im trying to extract data from different 'tables' inside a 'Main Table' on the same page (Same URL). The items fields have the same XPath / same structure in all sub-tables, so the problem I am facing is just to add 'Multiple' XPath for the tables sections on this page.
Here what my code looks like :
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import TutorialItem
class MySpider(BaseSpider):
name = "test"
allowed_domains = ["blabla.com"]
start_urls = ["http://www.blablabl..com"] // Start_url Doesnt change = Same Page
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = [hxs.select('//tr[#class="index class_tr group-6487"]')]
//Here I would like to have Mltiple XPathSelectors ex:
// titles = [hxs.select('//tr[#class="index class_tr group-6488"]')]
// titles = [hxs.select('//tr[#class="index class_tr group-6489"]')]
// Each for a table section within the same 'Main Table'
items = []
for title in titles:
item = TutorialItem()
item ['name'] = title.select('td[3]/span/a/text()').extract()
item ['encryption'] = title.select('td[5]/text()').extract()
item ['compression'] = title.select('td[8]/text()').extract()
item ['resolution'] = title.select('td[7]/span/text()').extract()
items.append(item)
return items
I would appreciate any hint if this is achievable; If I write a different spider for each table section, then I will end up with 10 spiders for the same URL/table and I am not quite sure if data could be retrieved within the same 'csv' file in order.

Try this:
titles = [hxs.select('//tr[#class="index class_tr group-6487"] | //tr[#class="index class_tr group-6488"] | //tr[#class="index class_tr group-6489"]')]

Related

Scrapy yield only last data and merge scrapy data into one

I am scraping some news website with scrapy framework, it seems only store the last item scraped and repeated in loop
I want to store the Title,Date,and Link, which i scrape from the first page
and also store the whole news article. So i want to merge the article which stored in a list into a single string.
Item code
import scrapy
class ScrapedItem(scrapy.Item):
# define the fields for your item here like:
title = scrapy.Field()
source = scrapy.Field()
date = scrapy.Field()
paragraph = scrapy.Field()
Spider code
import scrapy
from ..items import ScrapedItem
class CBNCSpider(scrapy.Spider):
name = 'kontan'
start_urls = [
'https://investasi.kontan.co.id/rubrik/28/Emiten'
]
def parse(self, response):
box_text = response.xpath("//ul/li/div[#class='ket']")
items = ScrapedItem()
for crawl in box_text:
title = crawl.css("h1 a::text").extract()
source ="https://investasi.kontan.co.id"+(crawl.css("h1 a::attr(href)").extract()[0])
date = crawl.css("span.font-gray::text").extract()[0].replace("|","")
items['title'] = title
items['source'] =source
items['date'] = date
yield scrapy.Request(url = source,
callback=self.parseparagraph,
meta={'item':items})
def parseparagraph(self, response):
items_old = response.meta['item'] #only last item stored
paragraph = response.xpath("//p/text()").extract()
items_old['paragraph'] = paragraph #merge into single string
yield items_old
I expect the output that the Date,Title,and Source can be updated through the loop.
And the article can be merged into single string to be stored in mysql

I defined an empty dictionary and put those variables within it. Moreover, I've brought about some minor changes in your xpaths and css selectors to make them less error prone. The script is working as desired now:
import scrapy
class CBNCSpider(scrapy.Spider):
name = 'kontan'
start_urls = [
'https://investasi.kontan.co.id/rubrik/28/Emiten'
]
def parse(self, response):
for crawl in response.xpath("//*[#id='list-news']//*[#class='ket']"):
d = {}
d['title'] = crawl.css("h1 > a::text").get()
d['source'] = response.urljoin(crawl.css("h1 > a::attr(href)").get())
d['date'] = crawl.css("span.font-gray::text").get().strip("|")
yield scrapy.Request(
url=d['source'],
callback=self.parseparagraph,
meta={'item':d}
)
def parseparagraph(self, response):
items_old = response.meta['item']
items_old['paragraph'] = response.xpath("//p/text()").getall()
yield items_old

Scraping Snapdeal to extract the Mobile phone features

I want to scrape the mobile product features from snapdeal.
//*[#id="productSpecs"]/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[2]
//*[#id="productSpecs"]/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[1]
these are the xpaths. I can see the results via scraper extention in google chrome.But cant fetch the results through scrapy.
from scrapy.spider import BaseSpider
# from scrapy.selector import HtmlXPathSelector
from scrapy.selector import Selector
from demo.items import CraigslistSampleItem
class MySpider(BaseSpider):
name = "craigs"
allowed_domains = ["www.snapdeal.com"]
start_urls = ["http://www.snapdeal.com/product/samsung-galaxy-j2-8gb/655619199985"]
def parse(self, response):
# hxs = HtmlXPathSelector(response)
sel = Selector(response)
titles = sel.xpath("//*[#id='productSpecs']/div/div[2]/div[2]/div/table[1]/tbody/tr/td/table/tbody/tr/td[2]")
print titles
items = []
for titles in titles:
item = CraigslistSampleItem()
# item["Brand"] = titles.extract()
items.append(item)
print items
titles is prints empty, this is the sample code.

Edit your xpath as:
titles = sel.xpath("//*[#id='productSpecs']/div/div[2]/div[2]/div/table[1]/tr/td/table/tr/td[2]")
This happens because chrome adds extra tbody tag in the source code.

HTMLXPathSelector for Scrappy returning null results

I just started learning python / Scrapy. I was able to follow tutorials successfully but I am struggling with a 'test' scraping that I want to do on my own.
What I am trying to do now is go on http://jobs.walmart.com/search/finance-jobs and scrape the job listing.
However, I think I may be doing something wrong in the XPath, but I am not sure what.
There is no "id" for that table, so I am using its class.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class MySpider(BaseSpider):
name = "walmart"
allowed_domains = ["jobs.walmart.com"]
start_urls = ["http://jobs.walmart.com/search/finance-jobs"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//table[#class='tableSearchResults']")
items = []
for titles in titles:
item = walmart()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
items.append(item)
return items
here is what the page source looks like:

The problem as you said also, is your XPATH. It is always useful to run:
scrapy view http://jobs.walmart.com/search/finance-jobs
Before running your spider, to see how the website look like from scrapy view.
This should work now:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
class MySpider(BaseSpider):
name = "walmart"
allowed_domains = ["jobs.walmart.com"]
start_urls = ["http://jobs.walmart.com/search/finance-jobs"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
item = walmart()
titles = hxs.select("//table[#class='tableSearchResults']/tr")
items = []
for title in titles:
if title.select("td[#class='td1']/a").extract():
item ["title"] = title.select("td[#class='td1']/a/text()").extract()
item ["link"] = title.select("td[#class='td1']/a/#href").extract()
items.append(item)
return items

Scraping with scrapy

I am trying to dig a little deeper with scrapy, but can only get the title of what i am scrapying and not any of the details. Here is the code that I have so far:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tcgplayer1.items import Tcgplayer1Item
class MySpider(BaseSpider):
name = "tcg"
allowed_domains = ["http://www.tcgplayer.com/"]
start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//div[#class='magicCard']")
vendor = hxs.select("//tr[#class='vendor']")
items = []
for titles in titles:
item = Tcgplayer1Item()
item ["cardname"] = titles.select("//li[#class='cardName']/a/text()").extract()
item ["price"] = vendor.select("//td[#class='price']/br/text()").extract()
item ["quantity"] = vendor.select("//td[#class='quantity']/td/text()").extract()
items.append(item)
return items
I cannot get the price and quantity to show any results. Each card has several vendors each with their own prices and quantities. I think that is where i am having problems. Any help will be greatly appreciated.

First of all, here's the fixed version of the code:
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from tcgplayer1.items import Tcgplayer1Item
class MySpider(BaseSpider):
name = "tcg"
allowed_domains = ["http://www.tcgplayer.com/"]
start_urls = ["http://store.tcgplayer.com/magic/journey-into-nyx?PageNumber=1"]
def parse(self, response):
hxs = Selector(response)
titles = hxs.xpath("//div[#class='magicCard']")
for title in titles:
item = Tcgplayer1Item()
item["cardname"] = title.xpath(".//li[#class='cardName']/a/text()").extract()[0]
vendor = title.xpath(".//tr[#class='vendor ']")
item["price"] = vendor.xpath("normalize-space(.//td[#class='price']/text())").extract()
item["quantity"] = vendor.xpath("normalize-space(.//td[#class='quantity']/text())").extract()
yield item
There were multiple issues with the code:
the vendor class name needs to contain a trailing space: "vendor " - it was tricky to find
there are multiple vendors per-item - you need to define vendor inside the loop
you are redefining titles variable in the loop
xpath expressions in the loop should be relative .//
use Selector instead of deprecated HtmlXPathSelector
use xpath() instead of deprecated select()
use normalize-space() to eliminate new-lines and extra spaces in price and quantity xpaths

First, you could change
item ["price"] = vendor.select("//td[#class='price']/br/text()").extract()
item ["quantity"] = vendor.select("//td[#class='quantity']/td/text()").extract()
To:
item ["price"] = titles.select("//td[#class='price']/br/text()").extract()
item ["quantity"] = titles.select("//td[#class='quantity']/td/text()").extract()
This will ensure you are only getting price and quantity rows for the card you want.
You may also have to remove the /br and /td from the selectors, so your code would look like this:
item ["price"] = titles.select("//td[#class='price']/text()").extract()
item ["quantity"] = titles.select("//td[#class='quantity']/text()").extract()

Scrapy Python Craigslist Scraper

I am trying to scrape Craigslist classifieds using Scrapy to extract items that are for sale.
I am able to extract date, post title, and post url but am having trouble extracting price.
For some reason the current code extracts all of the prices, but when I remove the // before the price span look up the price field returns as empty.
Can someone please review the code below and help me out?
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from craigslist_sample.items import CraigslistSampleItem
class MySpider(BaseSpider):
name = "craig"
allowed_domains = ["craigslist.org"]
start_urls = ["http://longisland.craigslist.org/search/sss?sort=date&query=raptor%20660&srchType=T"]
def parse(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.select("//p")
items = []
for titles in titles:
item = CraigslistSampleItem()
item['date'] = titles.select('span[#class="itemdate"]/text()').extract()
item ["title"] = titles.select("a/text()").extract()
item ["link"] = titles.select("a/#href").extract()
item ['price'] = titles.select('//span[#class="itempp"]/text()').extract()
items.append(item)
return items

itempp appears to be inside of another element, itempnr. Perhaps it would work if you were to change //span[#class="itempp"]/text() to span[#class="itempnr"]/span[#class="itempp"]/text().

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Scrapy crawl Multiple XPathSelector on same page - python

Try this: titles = [hxs.select('//tr[#class="index class_tr group-6487"] | //tr[#class="index class_tr group-6488"] | //tr[#class="index class_tr group-6489"]')]

Related

Scrapy yield only last data and merge scrapy data into one

Scraping Snapdeal to extract the Mobile phone features

HTMLXPathSelector for Scrappy returning null results

Scraping with scrapy

Scrapy Python Craigslist Scraper

Categories

Resources