Getting data from multiple links using scrapy - python

I am new to Scrapy and Python. I was trying to retrieve data from https://in.bookmyshow.com/movies, since I need the information for all the movies. But something is wrong with my code, and I would like to know where I have gone wrong.
rules = (
    Rule(SgmlLinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)), callback="parse_items", follow=True),
)

def parse_items(self, response):
    for sel in response.xpath('//div[contains(@class, "movie-card")]'):
        item = Ex1Item()
        item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
        item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
        item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
        item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
        item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
        yield item

Your code seems to be fine. Perhaps the problem is outside of the part you posted here.
This worked for me:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from ..items import Ex1Item  # assuming Ex1Item is defined in your project's items.py

class BookmyshowSpider(CrawlSpider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = Ex1Item()
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
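(Note: SgmlLinkExtractor has since been deprecated and removed from Scrapy. On current versions, the equivalent rule uses scrapy.linkextractors.LinkExtractor; a minimal sketch, assuming the rest of the spider is unchanged:)

from scrapy.linkextractors import LinkExtractor

rules = (
    Rule(LinkExtractor(allow=(r'https://in\.bookmyshow\.com/movies/.*',)),
         callback="parse_items", follow=True),
)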
EDIT: Version using the standard spider class scrapy.Spider()
import scrapy

class BookmyshowSpider(scrapy.Spider):
    name = "bookmyshow"
    start_urls = ['https://in.bookmyshow.com/movies']
    allowed_domains = ['bookmyshow.com']

    def parse(self, response):
        links = response.xpath('//a/@href').re(r'movies/[^/]+/.*$')
        for url in set(links):
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_movie)

    def parse_movie(self, response):
        for sel in response.xpath('//div[contains(@class, "movie-card")]'):
            item = {}
            item['Moviename'] = sel.xpath('.//a[@class="__movie-name"]/text()').extract()
            item['Language'] = sel.xpath('/html/body/div[1]/div[2]/div/div[1]/div[2]/section[1]/div/div[2]/div[1]/div[1]/div/div/div[2]/div[2]/ul/li/text()').extract()
            item['Info'] = sel.xpath('.//div[@class="__rounded-box __genre"]/text()').extract()
            item['Synopsis'] = sel.xpath('/html/body/div[1]/div[2]/div[1]/div[2]/div[4]/div[2]/div[2]/blockquote/text()').extract()
            item['Release'] = sel.xpath('.//span[@class="__release-date"]/text()').extract()
            yield item
parse() extracts all links to movie pages from the start page. parse_movie() is used as the callback for all requests to the individual movie pages. With this version you have more control over the spider's behavior.

Related

Get price from XHR and Combine Scrapy

I have to scrape data (name, price, description, brand, ...) from this website: https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww%7Cnew+in%7Cnew+products%7Cclothing
My code is as follows:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def remove_characters(self, value):
        return value.strip('\n')

    def parse_item(self, response):
        yield {
            'name': response.xpath("//div[@class='product-hero']/h1/text()").get(),
            'price': response.xpath("//span[@data-id='current-price']").get(),
            'description': response.xpath("//div[@class='product-description']/ul/li/text()").getall(),
            'about_me': response.xpath("//div[@class='about-me']//text()").getall(),
            'brand_description': response.xpath("//div[@class='brand-description']/p/text()").getall()
        }
However, the price is rendered with JavaScript, so I cannot get it from the HTML; I need to get it through XHR.
My code for getting the price of only one item in the list is as follows:
import scrapy
import json

class AsosSpider(scrapy.Spider):
    name = 'asos'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=200369183&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28']

    def parse(self, response):
        #print(response.body)
        resp = json.loads(response.text)[0]
        price = resp.get('productPrice').get('current').get('text')
        print(price)
        yield {
            'price': price
        }
Here, my start_urls is the request URL, and it changes for each item:
Item 1: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=23443988&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Item 2: https://www.asos.com/api/product/catalogue/v3/stockprice?productIds=22495685&store=ROW&currency=GBP&keyStoreDataversion=hnm9sjt-28
Only the productIds change. Do I need to insert the second spider into the first one to get the price as well? How can I do that?
Thanks!
items.py:
import scrapy

class AsosItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    description = scrapy.Field()
    about_me = scrapy.Field()
    brand_description = scrapy.Field()
As I said in your last post, I have a problem with this website on my computer for some reason, but you need to do something like this:
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import AsosItem

class TestcrawlSpider(CrawlSpider):
    name = 'testcrawl'
    allowed_domains = ['www.asos.com']
    start_urls = ['https://www.asos.com/women/new-in/new-in-clothing/cat/?cid=2623&nlid=ww|new+in|new+products|clothing']

    rules = (
        Rule(LinkExtractor(restrict_xpaths="//article[@class='_2qG85dG']/a"), callback='parse_item', follow=True),
        Rule(LinkExtractor(restrict_xpaths="//a[@class='_39_qNys']")),
    )

    def remove_characters(self, value):
        return value.strip('\n')

    def parse_item(self, response):
        # pull the stock-price API URL out of the page's inline JavaScript
        price_url = 'https://www.asos.com' + re.search(r"window.asos.pdp.config.stockPriceApiUrl = '(.+)'", response.text).group(1)
        item = AsosItem()
        item['name'] = response.xpath("//div[@class='product-hero']/h1/text()").get()
        item['description'] = response.xpath("//div[@class='product-description']/ul/li/text()").getall()
        item['about_me'] = response.xpath("//div[@class='about-me']//text()").getall()
        item['brand_description'] = response.xpath("//div[@class='brand-description']/p/text()").getall()
        request = scrapy.Request(url=price_url, callback=self.parse_price)
        request.meta['item'] = item
        return request

    def parse_price(self, response):
        jsonresponse = response.json()[0]
        price = jsonresponse['productPrice']['current']['text']
        item = response.meta['item']
        item['price'] = price
        return item
Test the code, and if it doesn't work, get the general idea and tweak it a bit; I can't test it myself.
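A side note: on Scrapy 1.7+ the documented way to pass data between callbacks is cb_kwargs rather than request.meta. A minimal sketch of the same hand-off, assuming the item and price_url built above:

# in parse_item, instead of stashing the item on request.meta:
request = scrapy.Request(url=price_url, callback=self.parse_price,
                         cb_kwargs={'item': item})
return request

def parse_price(self, response, item):
    # response.json() needs Scrapy 2.2+; json.loads(response.text) works anywhere
    item['price'] = response.json()[0]['productPrice']['current']['text']
    return item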

Scrapy Craigslist script

I want to create a Scrapy script to scrape all of the results for computer gigs in any craigslist subdomain,
for example here: http://losangeles.craigslist.org/search/cpg/
This query returns a list of many articles, and I've tried to scrape the title and href of each of the results (not only the ones on the first page) using CrawlSpider and LinkExtractor, but to no avail: the script returns nothing.
I'll paste my script here, thanks
import scrapy
from scrapy.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor

class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = (
        'http://losangeles.craigslist.org/search/cpg/',
    )
    rules = (Rule(LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)), callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")

    for i in items:
        link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
        title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
        print link, title
According to the code you pasted, parse_page:
1. does not return/yield anything, and
2. only contains one line: "items = response.selector..."
The reason for #2 above is that the for loop is not properly indented.
Try indenting the for loop:
class CraigspiderSpider(CrawlSpider):
    name = "CraigSpider"
    allowed_domains = ["http://losangeles.craigslist.org"]
    start_urls = ('http://losangeles.craigslist.org/search/cpg/',)
    rules = (Rule(
        LinkExtractor(allow=(), restrict_xpaths=('//a[@class="button next"]',)),
        callback="parse_page", follow=True),)

    def parse_page(self, response):
        items = response.selector.xpath("//p[@class='row']")
        for i in items:
            link = i.xpath("./span[@class='txt']/span[@class='pl']/a/@href").extract()
            title = i.xpath("./span[@class='txt']/span[@class='pl']/a/span[@id='titletextonly']/text()").extract()
            print link, title
            yield dict(link=link, title=title)
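One more thing worth checking, which the answer does not mention: allowed_domains should contain plain domain names, not URLs. With a scheme in the list, the offsite middleware can end up filtering every request the rule extracts, which would also produce "the script returns nothing". A corrected line:

allowed_domains = ["craigslist.org"]  # a bare domain, not "http://losangeles.craigslist.org"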

scrapy spider code check

So I'm trying to scrape the website given in the SgmlLinkExtractor parameters below with Scrapy, and this is what my spider looks like:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from desidime_sample.items import DesidimeItem
import string

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), restrict_xpaths=('''//td[not(@*)]/div
        [not(@*)]/a[not(@class)]/@href''')), callback="parse_items", follow=True),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        deals = hxs.select('''//div[@class='user-comment-text'][1]''')
        items = []
        for deals in deals:
            item = DesidimeItem()
            item["deal"] = deals.select("//div[@class='user-comment-text'][1]/p/text()").extract()
            item["link"] = deals.select("//div[@class='user-comment-text'][1]/p[1]/a[1]/@href").extract()
            items.append(item)
        return items
It should be quite obvious what I'm trying to do, but for some reason, when I tell the spider to crawl and export the text and links to a CSV file, I end up with:
link,deal http://wwww.facebook.com/desidime,
http://wwww.facebook.com/desidime,
(same thing for many more lines, then:)
",,"
, " same url" ,
(same thing for many more lines, then:)
"link,deals"
So, can anyone tell me what the problem is? If you run each of my above XPaths as response.xpath("xpath").extract() after scrapy shell "//correspondingcrawlruleurl", you'll get the right results.
The problem is inside the parse_items callback. When you iterate over the deals, the deal-specific locators have to be relative to the current context. In other words, start your XPath expressions inside the loop with a dot:
def parse_items(self, response):
    for deal in response.xpath("//div[@class='user-comment-text'][1]"):
        item = DesidimeItem()
        item["deal"] = deal.xpath(".//p/text()").extract()
        item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
        yield item
(note that I've also simplified the code).
Here is the complete spider I'm executing (it does scrape the text and links, though I don't know what your desired output is):
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class DesidimeItem(scrapy.Item):
    deal = scrapy.Field()
    link = scrapy.Field()

class DesidimeSpider(CrawlSpider):
    name = "desidime"
    allowed_domains = ["desidime.com"]
    start_urls = ["http://www.desidime.com/forums/hot-deals-online"]
    rules = [
        Rule(LinkExtractor(restrict_xpaths="//td[not(@*)]/div[not(@*)]/a[not(@class)]"),
             callback="parse_items",
             follow=True),
    ]

    def parse_items(self, response):
        for deal in response.xpath("//div[@class='user-comment-text'][1]"):
            item = DesidimeItem()
            item["deal"] = deal.xpath(".//p/text()").extract()
            item["link"] = deal.xpath(".//p[1]/a[1]/@href").extract()
            yield item

Scrapy XPath selector

I am scraping this site using Scrapy. However, I am having trouble with the XPath, and I'm not entirely sure what is going on:
Why does this work:
def parse_item(self, response):
    item = BotItem()
    for title in response.xpath('//h1'):
        item['title'] = title.xpath('strong/text()').extract()
        item['wage'] = title.xpath('span[@class="price"]/text()').extract()
        yield item
while the following code does not?
def parse_item(self, response):
    item = BotItem()
    for title in response.xpath('//body'):
        item['title'] = title.xpath('h1/strong/text()').extract()
        item['wage'] = title.xpath('h1/span[@class="price"]/text()').extract()
        yield item
I aim to also extract:
//div[@id="description"]/p
But I can't, because it is outside the h1 node. How can I achieve this? My full code is:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from bot.items import BotItem

class MufmufSpider(CrawlSpider):
    name = 'mufmuf'
    allowed_domains = ['mufmuf.ro']
    start_urls = ['http://mufmuf.ro/locuri-de-munca/joburi-in-strainatate/']
    rules = (
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="paginate"][position() = last()]'),
            #callback='parse_start_url',
            follow=True
        ),
        Rule(
            LinkExtractor(restrict_xpaths='//h3/a'),
            callback='parse_item',
            follow=True
        ),
    )

    def parse_item(self, response):
        item = BotItem()
        for title in response.xpath('//h1'):
            item['title'] = title.xpath('strong/text()').extract()
            item['wage'] = title.xpath('span[@class="price"]/text()').extract()
            #item['description'] = title.xpath('div[@id="description"]/p/text()').extract()
            yield item
The for title in response.xpath('//body'): option does not work because the XPath expressions in the loop look for an h1 element that is a direct child of the body element.
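(If you wanted to keep the //body loop anyway, prefixing the inner expressions with .//, the descendant axis, makes them match h1 elements at any depth; a minimal sketch:)

for title in response.xpath('//body'):
    item['title'] = title.xpath('.//h1/strong/text()').extract()
    item['wage'] = title.xpath('.//h1/span[@class="price"]/text()').extract()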
Moreover, since there is only one desired entity to extract per page, you don't need a loop here at all:
def parse_item(self, response):
    item = BotItem()
    item["title"] = response.xpath('//h1/strong/text()').extract()
    item["wage"] = response.xpath('//h1/span[@class="price"]/text()').extract()
    item["description"] = response.xpath('//div[@id="description"]/p/text()').extract()
    return item
(this should also answer your second question about the description)

Scrapy: only parse from pages with meta noindex

I am trying to crawl a website and parse only the pages that have a meta noindex tag.
What is happening is that the crawler crawls the first level but finishes with the first page; it does not seem to follow the links.
The following is my code:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def _response_downloaded(self, response):
        sel = HtmlXPathSelector(response)
        if sel.xpath('//meta[@content="noindex"]'):
            return super(mydomainSpider, self).parse_items(response)
        return

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        for site in sites:
            item = Website()
            item['url'] = response.url
            item['referer'] = response.request.headers.get('Referer')
            item['title'] = site.xpath('/html/head/title/text()').extract()
            item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
            items.append(item)
        yield items
The original _response_downloaded calls the _parse_response function, which, besides calling the callback function, also follows links. From the Scrapy code:
def _parse_response(self, response, callback, cb_kwargs, follow=True):
    if callback:
        cb_res = callback(response, **cb_kwargs) or ()
        cb_res = self.process_results(response, cb_res)
        for requests_or_item in iterate_spider_output(cb_res):
            yield requests_or_item
    if follow and self._follow_links:
        for request_or_item in self._requests_to_follow(response):
            yield request_or_item
You could replicate that link-following part, though I believe it's not the best way to go (the leading underscore suggests the method is meant to be private). Why not just check for the meta tag at the beginning of your parse_items function? And if you don't want to repeat that test, you could even write a Python decorator.
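A sketch of that decorator idea (require_noindex is a hypothetical helper, not part of Scrapy):

from functools import wraps

def require_noindex(parse_func):
    """Run the wrapped callback only when the page carries a noindex meta tag."""
    @wraps(parse_func)
    def wrapper(self, response):
        if response.xpath('//meta[@content="noindex"]'):
            return parse_func(self, response)
        return []  # nothing to extract from indexable pages
    return wrapper

# usage: put @require_noindex directly above the parse_items definition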
I believe checking for the meta tag at the beginning of my parse_items, as @Guy Gavriely suggested, will be my best option. I will test out the following code to see.
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        yield items
Working code update: I needed to return items instead of yielding them:
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from wallspider.items import Website

class mydomainSpider(CrawlSpider):
    name = "0resultsTest"
    allowed_domains = ["www.mydomain.com"]
    start_urls = ["http://www.mydomain.com/cp/3944"]
    rules = (
        Rule(SgmlLinkExtractor(allow=(), deny=()), callback="parse_items", follow=True,),
    )

    def parse_items(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//html')
        items = []
        if hxs.xpath('//meta[@content="noindex"]'):
            for site in sites:
                item = Website()
                item['url'] = response.url
                item['referer'] = response.request.headers.get('Referer')
                item['title'] = site.xpath('/html/head/title/text()').extract()
                item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
                items.append(item)
        return items
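The underlying issue: yield items emits the whole list as a single object, which Scrapy does not recognize as an item, whereas return items hands back a list that the framework knows how to iterate. The more idiomatic fix would be to yield each item individually inside the loop, roughly:

if hxs.xpath('//meta[@content="noindex"]'):
    for site in sites:
        item = Website()
        item['url'] = response.url
        item['referer'] = response.request.headers.get('Referer')
        item['title'] = site.xpath('/html/head/title/text()').extract()
        item['robots'] = site.select('//meta[@name="robots"]/@content').extract()
        yield item  # one item at a time, no list to accumulate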