I'm not able to scrape data - python

I'm using Scrapy to scrape data from a website. Here's my code:
import scrapy

class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']
    start_urls = ['http://https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers/']
    custom_settings = {
        'FEED_URI': 'tmp/shop.csv'
    }

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()
        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # sets the URL for Scrapy to download images
                'discount': item[3]
            }
        yield scraped_info
Can you please check where I'm going wrong?
Also, I want to scrape all the data that loads as I scroll, so the spider should keep collecting items for as long as the page keeps scrolling. How do I go about that?

You have problems with:
an incorrect allowed_domains (only the domain is needed);
broken start_urls (http appears twice and there is a trailing slash);
wrong indentation for yielding the item in the parse function.
Check the fixed code here:
import scrapy

class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['shopclues.com']
    start_urls = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()
        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # sets the URL for Scrapy to download images
                'discount': item[3]
            }
            yield scraped_info
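As for scraping while scrolling: infinite-scroll pages usually load additional items through background (AJAX) requests rather than rendering everything up front, so the usual approach is to find that request in the browser's network tab and have the spider page through it directly. A minimal sketch, assuming a hypothetical page query parameter; the real endpoint and parameter names must be copied from the request the site actually makes when you scroll:

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        if not titles:
            return  # no more results, stop paginating
        # ... extract and yield scraped_info as above ...

        # Hypothetical AJAX pagination parameter; inspect the network
        # tab to find the real request the page makes while scrolling.
        page = response.meta.get('page', 1) + 1
        yield scrapy.Request(
            f'https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers&page={page}',
            callback=self.parse,
            meta={'page': page},
        )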

Related

How to get data from a later function in scrapy

I'm having trouble structuring scrapy data as I want. My spider gets some data from one page, then follows a list of links on that page to get a link from each of those next pages.
def parse_page(self, response):
    links = response.css(LINK_SELECTOR).extract()
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    for link in links:
        next_link = response.urljoin(link)
        yield scrapy.Request(next_link, callback=self.parse_url, meta={'data': data})

def parse_url(self, response):
    data = response.meta['data']
    data['url'] = response.css('a::attr(href)').get()
    yield data
What I would like is to get the data with the following structure:
{'name': name, 'date': date, 'url': [url1, url2, url3, url4]}
Instead of
{'name': name, 'date': date, 'url': url1}
{'name': name, 'date': date, 'url': url2}
{'name': name, 'date': date, 'url': url3}
{'name': name, 'date': date, 'url': url4}
I've tried to use items, but I don't see how to pass the data from parse_url back to parse_page. How would I do that?
Thanks in advance.
You can use scrapy's coroutine support to do this pretty easily.
The code would look something like this:
async def parse_page(self, response):
    ...
    for link in links:
        request = response.follow(link)
        response = await self.crawler.engine.download(request, self)
        urls.append(response.css('a::attr(href)').get())
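A more complete sketch of that approach, filling in the question's selectors (this assumes a Scrapy version with coroutine support enabled, e.g. via the asyncio reactor):

async def parse_page(self, response):
    links = response.css(LINK_SELECTOR).extract()
    data = {
        'name': response.css(NAME_SELECTOR).extract_first(),
        'date': response.css(DATE_SELECTOR).extract(),
    }
    urls = []
    for link in links:
        request = response.follow(link)
        # Download inline instead of scheduling a callback, so all
        # URLs can be collected into a single item.
        resp = await self.crawler.engine.download(request, self)
        urls.append(resp.css('a::attr(href)').get())
    data['url'] = urls
    yield data

This yields one item per page in the desired {'name': ..., 'date': ..., 'url': [url1, url2, ...]} shape.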
The following is one way you can achieve that. There is a library, inline_requests, which will help you get the expected output.
import scrapy
from scrapy.crawler import CrawlerProcess
from inline_requests import inline_requests

class YellowpagesSpider(scrapy.Spider):
    name = "yellowpages"
    start_urls = ["https://www.yellowpages.com/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771"]

    @inline_requests
    def parse(self, response):
        data = {
            'name': response.css(".sales-info > h1::text").get(),
            'phone': response.css(".contact > p.phone::text").get(),
            'target_link': []
        }
        for item_link in response.css(".review-info > a.author[href]::attr(href)").getall():
            resp = yield scrapy.Request(response.urljoin(item_link), meta={'handle_httpstatus_all': True})
            target_link = resp.css("a.review-business-name::attr(href)").get()
            data['target_link'].append(target_link)
        print(data)

if __name__ == "__main__":
    c = CrawlerProcess({
        'USER_AGENT': 'Mozilla/5.0',
        'LOG_LEVEL': 'ERROR',
    })
    c.crawl(YellowpagesSpider)
    c.start()
Output it produces:
{'name': 'Honey Honey Cafe & Crepery', 'phone': '(415) 351-2423', 'target_link': ['/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771', '/walnut-ca/mip/akasaka-japanese-cuisine-455476824', '/san-francisco-ca/mip/honey-honey-cafe-crepery-4752771']}

Scrapy spider not scraping correct div

import scrapy

class rottenTomatoesSpider(scrapy.Spider):
    name = "movieList"
    start_urls = [
        'https://www.rottentomatoes.com/'
    ]

    def parse(self, response):
        for movieList in response.xpath('//div[@id="homepage-opening-this-week"]'):
            yield {
                'score': response.css('td.left_col').extract_first(),
                'title': response.css('td.middle_col').extract_first(),
                'openingDate': response.css('td.right_col right').extract_first()
            }
So the spider is instead scraping <div id='homepage-tv-top'>
I'm assuming it is the 'homepage-' prefix that is confusing the script. Anyone know a workaround?
You need to iterate over each tr, and inside the for loop use movieList instead of response:
for movieList in response.xpath('//div[@id="homepage-opening-this-week"]//tr'):
    yield {
        'score': "".join(a for a in movieList.css('td.left_col *::text').extract()),
        'title': "".join(a for a in movieList.css('td.middle_col *::text').extract()),
        'openingDate': "".join(a for a in movieList.css('td.right_col *::text').extract())
    }
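The reason this matters: calling response.css(...) inside the loop searches the entire page on every iteration, so extract_first() keeps returning the same first match; querying movieList restricts the search to the current row. A sketch of the full corrected parse method under that assumption:

def parse(self, response):
    # Select each table row inside the target section, then query
    # relative to the row so each yielded item describes one movie.
    for movieList in response.xpath('//div[@id="homepage-opening-this-week"]//tr'):
        yield {
            'score': "".join(movieList.css('td.left_col *::text').extract()),
            'title': "".join(movieList.css('td.middle_col *::text').extract()),
            'openingDate': "".join(movieList.css('td.right_col *::text').extract())
        }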

download and save images from a website using scrapy

I am new to scrapy and Python, so my question may be a simple one. Following an existing website guide, I've written a scraper which scrapes a website's pages and writes the image URL, name and ... to an output file. I want to download the images into a directory, but the output directory is empty!
Here is my code:
myspider.py
import scrapy

class BrickSetSpider(scrapy.Spider):
    name = 'brick_spider'
    start_urls = ['http://brickset.com/sets/year-2016']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for brickset in response.css(SET_SELECTOR):
            NAME_SELECTOR = 'h1 a ::text'
            PIECES_SELECTOR = './/dl[dt/text() = "Pieces"]/dd/a/text()'
            MINIFIGS_SELECTOR = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()'
            IMAGE_SELECTOR = 'img ::attr(src)'
            yield {
                'name': brickset.css(NAME_SELECTOR).extract_first(),
                'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
                'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
                'image': brickset.css(IMAGE_SELECTOR).extract_first(),
            }

        NEXT_PAGE_SELECTOR = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
settings.py

ITEM_PIPELINES = {'brickset.pipelines.BricksetPipeline': 1}
IMAGES_STORE = '/home/nmd/brickset/brickset/spiders/output'

items.py

import scrapy

class BrickSetSpider(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
Scrapy provides a media pipeline if you're interested in downloading files or images:
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
Then you need to add image_urls to your item for the pipeline to download the file, so change
yield {
    'name': brickset.css(NAME_SELECTOR).extract_first(),
    'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
    'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
    'image': brickset.css(IMAGE_SELECTOR).extract_first(),
}
to
yield {
    'name': brickset.css(NAME_SELECTOR).extract_first(),
    'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
    'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
    'image_urls': [brickset.css(IMAGE_SELECTOR).extract_first()],  # must be a list of URLs
}
For more details refer to https://doc.scrapy.org/en/latest/topics/media-pipeline.html
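Putting it together, a minimal settings.py sketch (the store path is the one from the question; any writable directory works, and the images pipeline also needs the Pillow library installed):

# settings.py
# Enable Scrapy's built-in images pipeline and tell it where to save files.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/nmd/brickset/brickset/spiders/output'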

Scrapy code using python giving result for one website and not for another website

When I am executing this code, I am getting results in the form {[text1, author1, tag1], [text2, author2, tag2], ...}
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
But with the same code for another URL (below), I am getting results as {[name1, name2, ...], [city1, city2, ...]}.
I want to have them in the form {[name1, city1], [name2, city2], ...}, as was happening with the code above.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "student"
    start_urls = [
        'http://www.engineering.careers360.com/colleges/list-of-engineering-colleges-in-karnataka?sort_filter=alpha',
    ]

    def parse(self, response):
        for students in response.css('div.list-pages'):
            yield {
                'name': students.css('div.title a::text').extract(),
                'city': students.css('div.clg-state a::text').extract(),
            }
Your students selector is faulty:
for students in response.css('div.list-pages'):
This selects the whole page, not the individual results.
What you are looking for here I think is:
for students in response.css('li.search-result'):
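With each iteration now covering a single result, extract_first() returns one paired value per student. A sketch of the full loop, assuming the question's title and city selectors still apply inside each result:

def parse(self, response):
    # One iteration per college, so name and city stay paired.
    for student in response.css('li.search-result'):
        yield {
            'name': student.css('div.title a::text').extract_first(),
            'city': student.css('div.clg-state a::text').extract_first(),
        }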

Empty list returned by XPath in scrapy

I am working on scrapy, and I am trying to gather some data from a site.
Spider code:
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http import FormRequest
# ExampleItem is defined in the project's items.py

class NaaptolSpider(BaseSpider):
    name = "naaptol"
    domain_name = "www.naaptol.com"
    start_urls = ["http://www.naaptol.com/buy/mobile_phones/mobile_handsets.html"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        cell_matter = hxs.select('//div[@class="gridInfo"]/div[@class="gridProduct gridProduct_special"]')
        items = []
        for i in cell_matter:
            cell_names = i.select('//p[@class="proName"]/a/text()').extract()
            prices = i.select('//p[@class="values"]/strong/text()').extract()
            item = ExampleItem()
            item['cell_name'] = cell_names
            item['price'] = prices
            items.append(item)
        return [FormRequest(url="http://www.naaptol.com/faces/jsp/search/searchResults.jsp",
                            formdata={'type': 'cat_catlg',
                                      'catid': '27',
                                      'sb': '9,8',
                                      'frm': '1',
                                      'max': '15',
                                      'req': 'ajax'},
                            callback=self.parse_item)]

    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        cell_matter = hxs.select('//div[@class="gridInfo"]/div[@class="gridProduct gridProduct_special"]')
        for i in cell_matter:
            cell_names = i.select('//p[@class="proName"]/a/text()').extract()
            prices = i.select('//p[@class="values"]/strong/text()').extract()
            print cell_names
            print prices
Result:
2012-06-15 09:38:36+0530 [naaptol] DEBUG: Crawled (200) <POST http://www.naaptol.com/faces/jsp/search/searchResults.jsp> (referer: http://www.naaptol.com/buy/mobile_phones/mobile_handsets.html)
[]
[]
Actually, I posted the form to handle the pagination, which is done in JavaScript.
Here I am receiving the response from the parse method in the parse_item method, but when I use the same XPath as in the parse method it returns an empty list, as shown above. Can anyone tell me why it returns an empty list, and what's wrong with my code?
Thanks in advance
The response is in JSON format:
{
    "prodList": [
        {
            "pid": "955492",
            "pnm": "Samsung Star 3 Duos",
            "mctid": "27",
            "pc": "5,650",
            "mrp": "6290",
            "pdc": "10",
            "pimg": "Samsung-Star-3-duos-1.jpg",
            "rt": "8",
            "prc": "1",
            "per": "Y",
            (...)
        },
        (...)
    ]
}
In order to parse it, you can use Python's json module. An example of what you are trying to achieve is here: Empty list for hrefs to achieve pagination through JavaScript onclick functions.
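A minimal sketch of what that could look like here, reading the field names shown in the response above (pnm for the product name, pc for the price); this replaces the XPath lookups, which find nothing because the body is JSON rather than HTML:

import json

def parse_item(self, response):
    # The endpoint returns JSON, so decode the body and read
    # the fields directly instead of using XPath.
    data = json.loads(response.body)
    for product in data['prodList']:
        print product['pnm']  # e.g. 'Samsung Star 3 Duos'
        print product['pc']   # e.g. '5,650'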
