I wrote this spider to scrape reviews of apps from google play. I am partially successful in this. I am able to extract the name, date, and review only.
My questions:
How to get all the reviews as I am only getting only 41.
How to get the rating from the <div>?
import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
class CompItem(scrapy.Item):
rating = scrapy.Field()
data = scrapy.Field()
name = scrapy.Field()
date = scrapy.Field()
class criticspider(CrawlSpider):
name = "gaana"
allowed_domains = ["play.google.com"]
start_urls = ["https://play.google.com/store/apps/details?id=com.gaana&hl=en"]
# rules = (
# Rule(
# SgmlLinkExtractor(allow=('search=jabong&page=1/+',)),
# callback="parse_start_url",
# follow=True),
# )
def parse(self, response):
sites = response.xpath('//div[#class="single-review"]')
items = []
for site in sites:
item = CompItem()
item['data'] = site.xpath('.//div[#class="review-body"]/text()').extract()
item['name'] = site.xpath('.//div/div/span[#class="author-name"]/a/text()').extract()[0]
item['date'] = site.xpath('.//span[#class="review-date"]/text()').extract()[0]
item['rating'] = site.xpath('div[#class="review-info-star-rating"]/aria-label/text()').extract()
return items
you have
item['rating'] = site.xpath('div[#class="review-info-star-rating"]/aria-label/text()').extract()
should it not be something like:
item['rating'] = site.xpath('.//div[#class="review-info-star-rating"]/aria-label/text()').extract()
?? dunno if it will work, but try :)
You can try this one out:
item['rating'] = site.xpath('.//div[#class="tiny-star star-rating-non-editable-container"]/#aria-label').extract()
I have to scrape data(name, price, description, brand,...) on this website: https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww%7Cnew+in%7Cnew+products%7Cclothing
My code is as such:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class TestcrawlSpider(CrawlSpider):
name = 'testcrawl'
def remove_characters(self,value):
return value.strip('\n')
allowed_domains = ['www.asos.com']
start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']
rules = (
Rule(LinkExtractor(restrict_xpaths="//article[#class='_2qG85dG']/a"), callback='parse_item', follow=True),
def parse_item(self, response):
'about_me': response.xpath("//div[#class='about-me']//text()").getall(),
However, due to javascript I cannot get the Price. I need to get it thorugh XHR.
My code for getting the price of only one item in the list is as followed:
import scrapy
import json
class AsosSpider(scrapy.Spider):
name = 'asos'
allowed_domains = ['www.asos.com']
start_urls = ['https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=200369183&store=ROW¤cy=GBP&keyStoreDataversion=hnm9sjt-28']
def parse(self, response):
resp = json.loads(response.text)[0]
price = resp.get('productPrice').get('current').get('text')
yield {
'price': price
Here, my start_urls is the Request URL. And it keeps changing for each item.
Item1: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=23443988&store=ROW¤cy=GBP&keyStoreDataversion=hnm9sjt-28
Item2: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=22495685&store=ROW¤cy=GBP&keyStoreDataversion=hnm9sjt-28
Only the productsIds are changing!!!
I need to insert the second code in the first code to get the price as well? How to do it please?
import scrapy
class AsosItem(scrapy.Item):
name = scrapy.Field()
price = scrapy.Field()
description = scrapy.Field()
about_me = scrapy.Field()
brand_description = scrapy.Field()
As I said in you last post I have a problem with this website on my computer for some reason, but you need to do something like this:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import AsosItem
class TestcrawlSpider(CrawlSpider):
name = 'testcrawl'
allowed_domains = ['www.asos.com']
start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']
rules = (
Rule(LinkExtractor(restrict_xpaths="//article[#class='_2qG85dG']/a"), callback='parse_item', follow=True),
def remove_characters(self,value):
return value.strip('\n')
def parse_item(self, response):
price_url = 'https://www.asos.com' + re.search(r'window.asos.pdp.config.stockPriceApiUrl = \'(.+)\'', response.text).group(1)
item = AsosItem()
item['name'] = response.xpath("//div[#class='product-hero']/h1/text()").get()
item['description'] = response.xpath("//div[#class='product-description']/ul/li/text()").getall()
item['about_me'] = response.xpath("//div[#class='about-me']//text()").getall()
item['brand_description'] = response.xpath("//div[#class='brand-description']/p/text()").getall()
request = scrapy.Request(url=price_url, callback=self.parse_price)
request.meta['item'] = item
return request
def parse_price(self, response):
jsonresponse = response.json()[0]
price = jsonresponse['productPrice']['current']['text']
item = response.meta['item']
item['price'] = price
return item
Test the code and if it doesn't work then get the general idea and tweak it a bit, I can't test it myself.
I am trying to scrape data of # pages. I have already done a scraper which can scrape data from a single # page. But it suddenly finished the work after scraping of the first page
The whole file with parse function and scrapd function - Scraper.py
# -*- coding: utf-8 -*-
import scrapy
import csv
import os
from scrapy.selector import Selector
from scrapy import Request
class Proddduct(scrapy.Item):
price = scrapy.Field()
description = scrapy.Field()
link = scrapy.Field()
content = scrapy.Field()
class LapadaScraperSpider(scrapy.Spider):
name = 'lapada_scraper2'
allowed_domains = ['http://www.lapada.org']
start_urls = ['https://lapada.org/art-and-antiques/?search=antique']
def parse(self, response):
next_page_url = response.xpath("//ul/li[#class='next']//a/#href").get()
for item in self.scrape(response):
yield item
if next_page_url:
print("Found url: {}".format(next_page_url))
yield scrapy.Request(url=next_page_url, callback=self.parse)
def scrape(self, response):
parser = scrapy.Selector(response)
products = parser.xpath("//div[#class='content']")
for product in products:
item = Proddduct()
XPATH_PRODUCT_DESCRIPTION = ".//strong/text()"
XPATH_PRODUCT_PRICE = ".//div[#class='price']/text()"
XPATH_PRODUCT_LINK = ".//a/#href"
raw_product_description = product.xpath(XPATH_PRODUCT_DESCRIPTION).extract()
raw_product_price = product.xpath(XPATH_PRODUCT_PRICE).extract()
raw_product_link = product.xpath(XPATH_PRODUCT_LINK).extract_first()
item['description'] = raw_product_description
item['price'] = raw_product_price
item['link'] = raw_product_link
yield item
def get_information(self, response):
item = response.meta['item']
item['phonenumber'] = "12345"
yield item
How can I scrape all items in all pages?
Change allowed_domains = ['http://www.lapada.org'] to allowed_domains = ['lapada.org']
My Hacker News spider outputs all the results on one line, instead of one each line, as it can be seen here.
All on the same line
Here is my code.
import scrapy
import string
import urlparse
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors import LinkExtractor
class HnItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
score = scrapy.Field()
class HnSpider(scrapy.Spider):
name = 'hackernews'
allowed_domains = ["news.ycombinator.com"]
start_urls = ["https://news.ycombinator.com/"]
def parse(self, response):
sel = response
selector_list = response.xpath('.//table[#class="itemlist"]')
for sel in selector_list:
item = HnItem()
item['title'] = sel.xpath('.//td[#class="title"]/text()').extract()
item['link'] = sel.xpath('.//tr[#class="athing"]/td[3]/a/#href').extract()
item['score'] = sel.xpath('.//td[#class="subtext"]/span/text()').extract()
yield item
and my settings.py file
BOT_NAME = 'hnews'
SPIDER_MODULES = ['hnews.spiders']
NEWSPIDER_MODULE = 'hnews.spiders'
USER_AGENT = 'hnews (+http://www.yourdomain.com)'
FEED_URI = '/used/scrapy/hnews/%(name)s/%(time)s.csv'
I've tried to implement this among many other solutions but no luck so far. I'm still very new at this, so bear with me if possible.
It is happening because your item pipeline is getting all the lists at once. For expample: The item['title'] is getting a list of all the titles at once which is then transferred to the item pipeline and then written to the csv file directly.
The solution is to iterate over the list and yield it to the item pipeline one at a time. Here's a modified code:
import scrapy
from scrapy.selector import Selector
class HnItem(scrapy.Item):
title = scrapy.Field()
link = scrapy.Field()
score = scrapy.Field()
class HnSpider(scrapy.Spider):
name = 'hackernews'
allowed_domains = ["news.ycombinator.com"]
start_urls = ["https://news.ycombinator.com/"]
def parse(self, response):
sel = Selector(response)
item = HnItem()
title_list = sel.xpath('.//td[#class="title"]/a/text()').extract()[:-2]
link_list= sel.xpath('.//tr[#class="athing"]/td[3]/a/#href').extract()
score_list = sel.xpath('.//td[#class="subtext"]/span/text()').extract()
for x in range(0,len(title_list)):
item['title'] = title_list[x]
item['link'] = link_list[x]
item['score'] = score_list[x]
yield item
I have scrapy spider and i am using xpath selectors to extract the contents of the page,kindly check where i am going wrong
from scrapy.contrib.loader import ItemLoader
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.selector import HtmlXPathSelector
from medicalproject.items import MedicalprojectItem
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy import Request
class MySpider(CrawlSpider):
name = "medical"
allowed_domains = ["yananow.org"]
start_urls = ["http://yananow.org/query_stories.php"]
rules = (
def parse_items(self, response):
hxs = HtmlXPathSelector(response)
titles = hxs.xpath('/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td')
items = []
for title in titles:
item = MedicalprojectItem()
item["patient_name"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/img[1]/text()").extract()
item["stories"] = title.xpath("/html/body/div/table/tbody/tr[2]/td/table/tbody/tr/td/div/font/p/text()").extract()
There are a lot of issues with your code so here is a different approach.
I opted against a CrawlSpider to have more control over the scraping process. Especially with grabbing the name from the query page and the story from a detail page.
I tried to simplify the XPath statements by not diving into the (nested) table structures but looking for patterns of content. So if you want to extract a story ... there must be a link to a story.
Here comes the tested code (with comments):
# -*- coding: utf-8 -*-
import scrapy
class MyItem(scrapy.Item):
name = scrapy.Field()
story = scrapy.Field()
class MySpider(scrapy.Spider):
name = 'medical'
allowed_domains = ['yananow.org']
start_urls = ['http://yananow.org/query_stories.php']
def parse(self, response):
rows = response.xpath('//a[contains(#href,"display_story")]')
#loop over all links to stories
for row in rows:
myItem = MyItem() # Create a new item
myItem['name'] = row.xpath('./text()').extract() # assign name from link
story_url = response.urljoin(row.xpath('./#href').extract()[0]) # extract url from link
request = scrapy.Request(url = story_url, callback = self.parse_detail) # create request for detail page with story
request.meta['myItem'] = myItem # pass the item with the request
yield request
def parse_detail(self, response):
myItem = response.meta['myItem'] # extract the item (with the name) from the response
text_raw = response.xpath('//font[#size=3]//text()').extract() # extract the story (text)
myItem['story'] = ' '.join(map(unicode.strip, text_raw)) # clean up the text and assign to item
yield myItem # return the item
I want to get all external links from a given website using Scrapy. Using the following code the spider crawls external links as well:
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['someurl.com']
start_urls = ['http://www.someurl.com/']
rules = (Rule (LinkExtractor(), callback="parse_obj", follow=True),
def parse_obj(self,response):
item = someItem()
item['url'] = response.url
return item
What am I missing? Doesn't "allowed_domains" prevent the external links to be crawled? If I set "allow_domains" for LinkExtractor it does not extract the external links. Just to clarify: I wan't to crawl internal links but extract external links. Any help appriciated!
You can also use the link extractor to pull all the links once you are parsing each page.
The link extractor will filter the links for you. In this example the link extractor will deny links in the allowed domain so it only gets outside links.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LxmlLinkExtractor
from myproject.items import someItem
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['someurl.com']
start_urls = ['http://www.someurl.com/']
rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)
def parse_obj(self,response):
for link in LxmlLinkExtractor(allow=(),deny = self.allowed_domains).extract_links(response):
item = someItem()
item['url'] = link.url
An updated code based on 12Ryan12's answer,
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.item import Item, Field
class MyItem(Item):
url= Field()
class someSpider(CrawlSpider):
name = 'crawltest'
allowed_domains = ['someurl.com']
start_urls = ['http://www.someurl.com/']
rules = (Rule(LxmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)
def parse_obj(self,response):
item = MyItem()
item['url'] = []
for link in LxmlLinkExtractor(allow=(),deny = self.allowed_domains).extract_links(response):
return item
A solution would be make usage a process_link function in the SgmlLinkExtractor
Documentation here http://doc.scrapy.org/en/latest/topics/link-extractors.html
class testSpider(CrawlSpider):
name = "test"
bot_name = 'test'
allowed_domains = ["news.google.com"]
start_urls = ["https://news.google.com/"]
rules = (
Rule(SgmlLinkExtractor(allow_domains=()), callback='parse_items',process_links="filter_links",follow= True) ,
def filter_links(self, links):
for link in links:
if self.allowed_domains[0] not in link.url:
print link.url
return links
def parse_items(self, response):
### ...