download and save images from a website using scrapy - python

I am new to Scrapy and Python, so my question may be a simple one. Following an existing website guide, I've written a spider which scrapes a website's pages and writes the image URL, name, etc. to an output file. I want to download the images into a directory, but the output directory stays empty!
Here is my code:
myspider.py
import scrapy

class BrickSetSpider(scrapy.Spider):
    name = 'brick_spider'
    start_urls = ['http://brickset.com/sets/year-2016']

    def parse(self, response):
        SET_SELECTOR = '.set'
        for brickset in response.css(SET_SELECTOR):
            NAME_SELECTOR = 'h1 a ::text'
            PIECES_SELECTOR = './/dl[dt/text() = "Pieces"]/dd/a/text()'
            MINIFIGS_SELECTOR = './/dl[dt/text() = "Minifigs"]/dd[2]/a/text()'
            IMAGE_SELECTOR = 'img ::attr(src)'
            yield {
                'name': brickset.css(NAME_SELECTOR).extract_first(),
                'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
                'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
                'image': brickset.css(IMAGE_SELECTOR).extract_first(),
            }

        NEXT_PAGE_SELECTOR = '.next a ::attr(href)'
        next_page = response.css(NEXT_PAGE_SELECTOR).extract_first()
        if next_page:
            yield scrapy.Request(
                response.urljoin(next_page),
                callback=self.parse
            )
settings.py
ITEM_PIPELINES = {'brickset.pipelines.BricksetPipeline': 1}
IMAGES_STORE = '/home/nmd/brickset/brickset/spiders/output'
items.py
import scrapy

class BrickSetSpider(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

Scrapy provides a media pipeline if you're interested in downloading files or images:
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
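Note that the images pipeline also needs an IMAGES_STORE setting pointing at a writable directory (you already have one) and the Pillow library installed, otherwise it won't download anything. A minimal settings.py sketch, reusing your existing path:

ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = '/home/nmd/brickset/brickset/spiders/output'  # any writable directory works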
Then you need an image_urls field in your item for the pipeline to download the files (it must be a list of URLs), so change
yield {
    'name': brickset.css(NAME_SELECTOR).extract_first(),
    'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
    'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
    'image': brickset.css(IMAGE_SELECTOR).extract_first(),
}
to
yield {
    'name': brickset.css(NAME_SELECTOR).extract_first(),
    'pieces': brickset.xpath(PIECES_SELECTOR).extract_first(),
    'minifigs': brickset.xpath(MINIFIGS_SELECTOR).extract_first(),
    'image_urls': [brickset.css(IMAGE_SELECTOR).extract_first()],  # must be a list of URLs
}
For more details refer to https://doc.scrapy.org/en/latest/topics/media-pipeline.html
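Putting it together, a minimal sketch of the corrected spider (assuming the img src values on the page are usable URLs; response.urljoin() below handles both relative and absolute ones):

import scrapy

class BrickSetSpider(scrapy.Spider):
    name = 'brick_spider'
    start_urls = ['http://brickset.com/sets/year-2016']

    def parse(self, response):
        for brickset in response.css('.set'):
            image = brickset.css('img ::attr(src)').extract_first()
            yield {
                'name': brickset.css('h1 a ::text').extract_first(),
                # image_urls must be a list of URLs for the pipeline
                'image_urls': [response.urljoin(image)] if image else [],
            }

With the pipeline enabled, the downloaded files land under IMAGES_STORE in a full/ subdirectory, named by a hash of the URL, and the download results are recorded in the item's images field.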

Related

How to use Scrapy Items and store output in json format?

I'm trying to get my output to look like the following in json format.
[{"title": "Test", "kategorie": "abc", "url": "www.url.com"},
{"title": "Test", "kategorie": "xyz", "url": "www.url.com"},
{"title": "Test", "kategorie": "sca", "url": "www.url.com"}]
but after switching to Items, some of the values (though not all of them) end up wrapped in lists:
[{"title": ["Test"], "kategorie": ["abc"], "url": "www.url.com"},
{"title": ["Test"], "kategorie": ["xyz"], "url": "www.url.com"},
{"title": ["Test"], "kategorie": ["sca"], "url": "www.url.com"}]
This is my items.py
class MyItem(scrapy.Item):
    title = scrapy.Field()
    kategorie = scrapy.Field()
    url = scrapy.Field()
This is my pipelines.py which is enabled in settings.py.
from scrapy.exporters import JsonItemExporter

class MyPipeline(object):
    file = None

    def open_spider(self, spider):
        self.file = open('item.json', 'wb')
        self.exporter = JsonItemExporter(self.file)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item
This is the parse method in my spider.py. All the xpath calls return a list of scraped values. They are then zipped together and used to build, iteratively, the dictionaries that end up in the exported JSON file.
def parse(self, response):
    item = MyItem()
    title = response.xpath('//h5/text()').getall()
    kategorie = response.xpath('//span[@class="abc1"]//text()').getall()
    url = response.xpath('//div[@class="abc3"]//a/@href').getall()
    data = zip(title, kategorie, url)
    for i in data:
        item['title'] = i[0],
        item['kategorie'] = i[1],
        item['url'] = i[2]
        yield item
This is how I start the crawling process:
scrapy crawl spider_name
If I don't use Items and Pipelines it works fine using:
scrapy crawl spider_name -o item.json
I am wondering why some of the values are stored in lists while others are not. If someone has an approach, that would be really great.
The values end up wrapped in lists because of the trailing commas in your parse method: item['title'] = i[0], assigns a one-element tuple, which the JSON exporter writes as a list. Dropping the trailing commas would already fix that. Beyond that, using the Scrapy FEEDS setting and Item you can yield the item objects directly from the parse method without needing pipelines or zipping the lists first. See the sample below.
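As a quick illustration of the trailing-comma issue (a hypothetical REPL session):

>>> value = "Test",   # the trailing comma makes this a one-element tuple
>>> value
('Test',)

Serialized to JSON, that tuple comes out as ["Test"].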
import scrapy

class MyItem(scrapy.Item):
    title = scrapy.Field()
    kategorie = scrapy.Field()
    url = scrapy.Field()

class SampleSpider(scrapy.Spider):
    name = 'sample'
    start_urls = ['https://brownfield24.com/grundstuecke']
    custom_settings = {
        "FEEDS": {
            "items.json": {
                "format": "json"
            }
        }
    }

    def parse(self, response):
        for property in response.xpath("//*[contains(@class,'uk-link-reset')]"):
            item = MyItem()
            item['title'] = property.xpath(".//h5/text()").get()
            item['url'] = property.xpath(".//a/@href").get()
            item['kategorie'] = property.xpath(".//div[@class='uk-card-body']/p/span/text()").get()
            yield item
Running the spider using scrapy crawl sample writes the scraped items to items.json in the desired format.

I'm not able to scrape data

I'm using Scrapy to scrape data from a website. Here's my code:
import scrapy

class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']
    start_urls = ['http://https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers/']
    custom_settings = {
        'FEED_URI': 'tmp/shop.csv'
    }

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()

        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # sets the URL for Scrapy to download images
                'discount': item[3]
            }
        yield scraped_info
Please check where I'm going wrong.
Also, I want to scrape all the data that loads as I scroll, so the spider should pick up everything added while scrolling. How do I go about that?
You have problems with:
an incorrect allowed_domains (only the domain is needed);
broken start_urls (http twice and a slash at the end);
wrong indentation for yielding the item in the parse function.
Check fixed code here:
import scrapy

class ShopSpider(scrapy.Spider):
    name = 'shop'
    allowed_domains = ['shopclues.com']
    start_urls = ['https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers']

    def parse(self, response):
        titles = response.css('img::attr(title)').extract()
        images = response.css('img::attr(data-img)').extract()
        prices = response.css('.p_price::text').extract()
        discounts = response.css('.prd_discount::text').extract()

        for item in zip(titles, prices, images, discounts):
            scraped_info = {
                'title': item[0],
                'price': item[1],
                'image_urls': [item[2]],  # sets the URL for Scrapy to download images
                'discount': item[3]
            }
            yield scraped_info
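As for the scrolling part: infinite-scroll pages usually fetch further results through a background XHR request rather than shipping them all in the initial HTML. One common approach is to find that request in the browser's network tab and have the spider call it directly with an increasing page parameter. A rough sketch, with a purely hypothetical paging URL (take the real endpoint from the network tab):

import scrapy

class ShopScrollSpider(scrapy.Spider):
    name = 'shop_scroll'
    allowed_domains = ['shopclues.com']
    # hypothetical paged URL -- inspect the network tab for the real one
    page_url = 'https://www.shopclues.com/mobiles-smartphones.html?sort_by=bestsellers&page={}'

    def start_requests(self):
        yield scrapy.Request(self.page_url.format(1), cb_kwargs={'page': 1})

    def parse(self, response, page):
        titles = response.css('img::attr(title)').extract()
        for title in titles:
            yield {'title': title}
        # keep paging until a page comes back empty
        if titles:
            yield scrapy.Request(self.page_url.format(page + 1),
                                 cb_kwargs={'page': page + 1})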

Scrapy spider not scraping correct div

import scrapy

class rottenTomatoesSpider(scrapy.Spider):
    name = "movieList"
    start_urls = [
        'https://www.rottentomatoes.com/'
    ]

    def parse(self, response):
        for movieList in response.xpath('//div[@id="homepage-opening-this-week"]'):
            yield {
                'score': response.css('td.left_col').extract_first(),
                'title': response.css('td.middle_col').extract_first(),
                'openingDate': response.css('td.right_col right').extract_first()
            }
So the spider is instead scraping <div id='homepage-tv-top'>.
I'm assuming it is the homepage- prefix that is confusing the script. Does anyone know a workaround?
You need to iterate over each tr, and also use movieList instead of response inside the for loop:
for movieList in response.xpath('//div[@id="homepage-opening-this-week"]//tr'):
    yield {
        'score': "".join(a for a in movieList.css('td.left_col *::text').extract()),
        'title': "".join(a for a in movieList.css('td.middle_col *::text').extract()),
        'openingDate': "".join(a for a in movieList.css('td.right_col *::text').extract())
    }

Scrapy code using python giving result for one website and not for another website

When I execute this code I get results in the form {[text1, author1, tag1], [text2, author2, tag2], ...}:
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
    ]

    def parse(self, response):
        for quote in response.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('small.author::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
But with the same code for another URL (below) I get results in the form {[name1, name2, ...], [city1, city2, ...]}.
I want them in the form {[name1, city1], [name2, city2], ...}, as was happening with the code above.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "student"
    start_urls = [
        'http://www.engineering.careers360.com/colleges/list-of-engineering-colleges-in-karnataka?sort_filter=alpha',
    ]

    def parse(self, response):
        for students in response.css('div.list-pages'):
            yield {
                'name': students.css('div.title a::text').extract(),
                'city': students.css('div.clg-state a::text').extract(),
            }
Your students selector is faulty:
for students in response.css('div.list-pages'):
This only selects the whole page.
What you are looking for here I think is:
for students in response.css('li.search-result'):
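With that selector each iteration handles a single college, so you can also switch to extract_first() to get one name and one city per item; a sketch, assuming your inner selectors are otherwise correct:

def parse(self, response):
    for students in response.css('li.search-result'):
        yield {
            'name': students.css('div.title a::text').extract_first(),
            'city': students.css('div.clg-state a::text').extract_first(),
        }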

Scrapy Spider: relative link and absolute link

There is an example in the Scrapy documentation (Release 1.0.3); in its 7th row the urljoin method is used because the link is relative. When the link is absolute, what should I do?
example code:
import scrapy

class StackOverflowSpider(scrapy.Spider):
    name = 'stackoverflow'
    start_urls = ['http://stackoverflow.com/questions?sort=votes']

    def parse(self, response):
        for href in response.css('.question-summary h3 a::attr(href)'):
            full_url = response.urljoin(href.extract())
            yield scrapy.Request(full_url, callback=self.parse_question)

    def parse_question(self, response):
        yield {
            'title': response.css('h1 a::text').extract()[0],
            'votes': response.css('.question .vote-count-post::text').extract()[0],
            'body': response.css('.question .post-text').extract()[0],
            'tags': response.css('.question .post-tag::text').extract(),
            'link': response.url,
        }
You don't need to worry about it; urljoin() handles both cases properly:
In [1]: response.urljoin("http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery")
Out[1]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
In [2]: response.urljoin("/questions/426258/checking-a-checkbox-with-jquery")
Out[2]: 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'
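Under the hood, response.urljoin(url) simply joins url against response.url with the standard library's urljoin, which is why absolute URLs pass through unchanged. The same behavior in plain Python:

from urllib.parse import urljoin

base = 'http://stackoverflow.com/questions?sort=votes'
print(urljoin(base, '/questions/426258/checking-a-checkbox-with-jquery'))   # resolved against base
print(urljoin(base, 'http://stackoverflow.com/questions/426258/checking-a-checkbox-with-jquery'))  # returned as-is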
