Python + Scrapy: renaming downloaded images

IMPORTANT NOTE: all the answers available at the moment on Stack Overflow are for previous versions of Scrapy and don't work with the latest version, Scrapy 1.4.
Totally new to Scrapy and Python, I am trying to scrape some pages and download the images. The images are being downloaded, but they keep the SHA-1 hash of their URL as the filename, and I cannot figure out how to rename them.
I tried to rename them as "test", and "test" does appear in the output when I run scrapy crawl rambopics, along with the URL data, but the files in the destination folder are not renamed. Here is a sample of the output:
2017-06-11 00:27:06 [scrapy.core.scraper] DEBUG: Scraped from <200 http://www.theurl.com/>
{'image_urls': ['https://www.theurl.com/-a4Bj-ENjHOY/VyE1mGuJyUI/EAAAAAAAHMk/mw1_H-mEAc0QQEwp9UkTipxNCVR-xdbcgCLcB/s1600/Image%2B%25286%2525.jpg'],
 'image_name': ['test'], 'title': ['test'], 'filename': ['test'],
 'images': [{'url': 'https://www.theurl.com/-a4Bj-ENjHOY/VyE1mGuJyUI/EAAAAAAAHMk/mw1_H-mEAc0QQEwp9UkTipxNCVR-xdbcgCLcB/s1600/Image%2B%25286%2525.jpg',
             'path': 'full/fcbec9bf940b48c248213abe5cd2fa1c690cb879.jpg',
             'checksum': '7be30d939a7250cc318e6ef18a6b0981'}]}
So far I have tried many different solutions posted on Stack Overflow, but there is no clear answer to this question for the latest version of Scrapy in 2017; the proposed solutions appear to be almost all outdated. I am using Scrapy 1.4 with Python 3.6.
scrapy.cfg
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html
[settings]
default = rambopics.settings
[deploy]
#url = http://localhost:6800/
project = rambopics
items.py
import scrapy

class RambopicsItem(scrapy.Item):
    # defining items:
    image_urls = scrapy.Field()
    images = scrapy.Field()
    image_name = scrapy.Field()
    title = scrapy.Field()
    # pass -- don't really understand what pass is for
settings.py
BOT_NAME = 'rambopics'
SPIDER_MODULES = ['rambopics.spiders']
NEWSPIDER_MODULE = 'rambopics.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = "W:/scraped/"
pipelines.py
import scrapy
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class RambopicsPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        img_url = item['img_url']
        meta = {
            'filename': item['title'],
            'title': item['image_name']
        }
        yield Request(url=img_url, meta=meta)
(the spider) rambopics.py
from rambopics.items import RambopicsItem
from scrapy.selector import Selector
import scrapy

class RambopicsSpider(scrapy.Spider):
    name = 'rambopics'
    allowed_domains = ['theurl.com']
    start_urls = ['http://www.theurl.com/']

    def parse(self, response):
        for sel in response.xpath('/html'):
            #img_name = sel.xpath("//h3[contains(@class, 'entry-title')]/a/text()").extract()
            img_name = 'test'
            #img_title = sel.xpath("//h3[contains(@class, 'entry-title')]/a/text()").extract()
            img_title = 'test'
            for elem in response.xpath("//div[contains(@class, 'entry-content')]"):
                img_url = elem.xpath("a/@href").extract_first()
                yield {
                    'image_urls': [img_url],
                    'image_name': [img_name],
                    'title': [img_title],
                    'filename': [img_name]
                }
Note: I don't know which meta key to use for the final downloaded file name (I'm not sure if it's filename, image_name, or title).

Use the file_path method to change the image names, as follows:
import scrapy
from scrapy.pipelines.files import FilesPipeline

class SaveImagesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # attach the desired filename to each download request
        for i, image_url in enumerate(item['image_urls'], start=1):
            filename = '{}_{}.jpg'.format(item['image_name'], i)  # matches the item field defined in items.py
            yield scrapy.Request(image_url, meta={'filename': filename})

    def file_path(self, request, response=None, info=None):
        # the pipeline stores the file under whatever path this returns
        return request.meta['filename']
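A detail that trips people up: a custom pipeline only runs if it is registered in ITEM_PIPELINES in place of the stock entry (the question's settings.py registers scrapy.pipelines.images.ImagesPipeline, so a custom class defined in pipelines.py is never invoked). A minimal sketch, assuming the class above lives in rambopics/pipelines.py:
ITEM_PIPELINES = {
    'rambopics.pipelines.SaveImagesPipeline': 1,  # replaces the stock pipeline entry
}
FILES_STORE = 'W:/scraped/'  # a FilesPipeline subclass reads FILES_STORE; an ImagesPipeline subclass reads IMAGES_STORE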

Related

Scrapy Image Pipeline not downloading images

I am trying to scrape a website using Scrapy to download images. When I run the code it runs fine, but it doesn't download the images, even though I have specified the image pipeline and directory in my settings.py.
spider.py
import re
import scrapy
import os
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import ImagesItem

class ImageSpiderSpider(CrawlSpider):
    name = 'image_spider'
    allowed_domains = ['books.toscrape.com']
    # start_urls = ['http://books.toscrape.com/']

    def start_requests(self):
        url = 'http://books.toscrape.com/'
        yield scrapy.Request(url=url)

    rules = (
        Rule(LinkExtractor(allow=r'catalogue/'), callback='parse_image', follow=True),
    )

    # save_location = os.getcwd()
    custom_settings = {
        "ITEM_PIPELINES": {'scrapy.pipelines.images.ImagesPipeline': 1},
        "IMAGES_STORE": '.images_download/full'
    }

    def parse_image(self, response):
        if response.xpath('//div[@class="item active"]/img').get() is not None:
            img = response.xpath('//div[@class="item active"]/img/@src').get()
            """
            Compute the absolute URL of the image file:
            "image_urls" requires absolute URLs, not relative paths.
            """
            m = re.match(r"^(?:../../)(.*)$", img).group(1)
            url = "http://books.toscrape.com/"
            img_url = "".join([url, m])
            image = ImagesItem()
            image["image_urls"] = [img_url]  # "image_urls" must be a list
            yield image
items.py
import scrapy

class ImagesItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
settings.py
BOT_NAME = 'images'
SPIDER_MODULES = ['images.spiders']
NEWSPIDER_MODULE = 'images.spiders'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
IMAGES_STORE = "/Home/PycharmProjects/scrappy/images/images_downloader"
I can't test it as a project, but I tested your code as a standalone script and it works for me.
I put all this code in one file, script.py, and ran it with python script.py
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ImagesItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

class ImageSpiderSpider(CrawlSpider):
    name = 'image_spider'
    allowed_domains = ['books.toscrape.com']
    start_urls = ['http://books.toscrape.com/']

    rules = (
        Rule(LinkExtractor(allow=r'catalogue/'), callback='parse_image', follow=True),
    )

    custom_settings = {
        "ITEM_PIPELINES": {'scrapy.pipelines.images.ImagesPipeline': 1},
        "IMAGES_STORE": '.',
    }

    def parse_image(self, response):
        img = response.xpath('//div[@class="item active"]/img/@src').get()
        if img:
            img_url = response.urljoin(img)
            #image = dict()
            image = ImagesItem()
            image["image_urls"] = [img_url]  # "image_urls" must be a list
            yield image

# --- run without a project and save in `output.csv` ---
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    #'USER_AGENT': 'Mozilla/5.0',
    # save to a CSV, JSON or XML file
    'FEEDS': {'output.csv': {'format': 'csv'}},  # new in Scrapy 2.1
    #'ITEM_PIPELINES': {'scrapy.pipelines.images.ImagesPipeline': 1},  # the standard ImagesPipeline (downloads to IMAGES_STORE/full)
    #'IMAGES_STORE': '.',  # this folder has to exist before downloading
})
c.crawl(ImageSpiderSpider)
c.start()
It creates a subfolder full with images that have names like 0a007ac89083ad8b68c56ec0f8df5a811e76607c.jpg, because the standard pipeline uses a hash of the URL as the name.
It also creates a file output.csv with rows like
image_urls,images
http://books.toscrape.com/media/cache/b1/0e/b10eabab1e1c811a6d47969904fd5755.jpg,"[{'url': 'http://books.toscrape.com/media/cache/b1/0e/b10eabab1e1c811a6d47969904fd5755.jpg', 'path': 'full/d78460eb2aa4417e52a8d9850934e35ef6b6117f.jpg', 'checksum': 'e7f8ece4eab2ff898a20ce53b4b50dcb', 'status': 'downloaded'}]"
The same information also appears directly in the console:
{'image_urls': ['http://books.toscrape.com/media/cache/ee/cf/eecfe998905e455df12064dba399c075.jpg'],
'images': [{'checksum': '693caff3d97645e73bd28da8e5974946',
'path': 'full/59d0249d6ae2eeb367e72b04740583bc70f81558.jpg',
'status': 'downloaded',
'url': 'http://books.toscrape.com/media/cache/ee/cf/eecfe998905e455df12064dba399c075.jpg'}]}
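If readable names are wanted instead of the hashes, the same file_path override from the answer to the first question can be bolted onto this standalone script. A sketch under the assumption that the URL basename is unique enough to avoid collisions:
import os
from urllib.parse import urlparse
from scrapy.pipelines.images import ImagesPipeline

class UrlNameImagesPipeline(ImagesPipeline):
    # hypothetical subclass: keep the basename from the URL instead of the SHA-1 hash
    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # *args/**kwargs absorb the extra 'item' argument newer Scrapy versions pass
        return 'full/' + os.path.basename(urlparse(request.url).path)
To use it in the script, point custom_settings at it with "ITEM_PIPELINES": {'__main__.UrlNameImagesPipeline': 1} (the class is importable as __main__ because the script runs directly).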

Save downloaded files with custom names in scrapy

I am new to Scrapy. I downloaded some files using the code below. I want to change the names of the downloaded files but I don't know how.
For example, I want to have a list containing names and use it to rename the files that I downloaded.
Any help will be appreciated.
my spider
import scrapy
from scrapy.loader import ItemLoader
from demo_downloader.items import DemoDownloaderItem
class FileDownloader(scrapy.Spider):
    name = "file_downloader"

    def start_requests(self):
        urls = [
            "https://www.data.gouv.fr/en/datasets/bases-de-donnees-annuelles-des-accidents-corporels-de-la-circulation-routiere-annees-de-2005-a-2019/#_"
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        for link in response.xpath('//article[@class = "card resource-card "]'):
            name = link.xpath('.//h4[@class="ellipsis"]/text()').extract_first()
            if ".csv" in name:
                loader = ItemLoader(item=DemoDownloaderItem(), selector=link)
                absolute_url = link.xpath(".//a[@class = 'btn btn-sm btn-primary']//@href").extract_first()
                loader.add_value("file_urls", absolute_url)
                loader.add_value("files", name)
                yield loader.load_item()
items.py
from scrapy.item import Field, Item

class DemoDownloaderItem(Item):
    file_urls = Field()
    files = Field()
pipelines.py
from itemadapter import ItemAdapter

class DemoDownloaderPipeline:
    def process_item(self, item, spider):
        return item
settings.py
BOT_NAME = 'demo_downloader'
SPIDER_MODULES = ['demo_downloader.spiders']
NEWSPIDER_MODULE = 'demo_downloader.spiders'
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1
}
DOWNLOAD_TIMEOUT = 1200
FILES_STORE = "C:\\Users\\EL\\Desktop\\work\\demo_downloader"
MEDIA_ALLOW_REDIRECTS = True
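For the renaming itself, the file_path approach from the first answer above applies here as well. A minimal sketch, assuming a hypothetical file_names item field that carries the desired names (stashing them in files, as the spider above does, is fragile, because the FilesPipeline overwrites that field with its download results):
import scrapy
from scrapy.pipelines.files import FilesPipeline

class RenamingFilesPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # 'file_names' is an assumed field, parallel to 'file_urls'
        for url, name in zip(item['file_urls'], item['file_names']):
            yield scrapy.Request(url, meta={'filename': name})

    def file_path(self, request, response=None, info=None, *args, **kwargs):
        # *args/**kwargs absorb the extra 'item' argument newer Scrapy versions pass
        return request.meta['filename']
Then register it in settings.py in place of the stock pipeline:
ITEM_PIPELINES = {
    'demo_downloader.pipelines.RenamingFilesPipeline': 1
}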

Scrapy - Scrape both text and images in the same spider

Scrapy super noob here. Problem: I have an HTML page that contains both information that I want to scrape and a URL that I want to follow to get image URLs for images that I want to download and save via the Scrapy image pipeline.
My approach to achieve this:
1. Scrape all the details as usual with a parse method.
2. Find the URL in the initial page, and create a request with a second parse method as callback, where I build the image_urls list.
So, I have the following setup:
settings.py
...
ITEM_PIPELINES = {
    'crawlbot.pipelines.MybotPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/url/to/images' #valid path to actual folder
...
pipelines.py
import pymongo

class MybotPipeline(object):
    def __init__(self):
        self.conn = pymongo.MongoClient('localhost', 27017)
        db = self.conn['libraries']
        self.collection = db['books']

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        return item
items.py
import scrapy

class MybotItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    description = scrapy.Field()
crawler.py
import scrapy
from scrapy.spiders import CrawlSpider

class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['books.com']

    def start_requests(self):
        urls = [
            'https://www.books.com/some/url'
        ]
        custom_settings = {
            'DEPTH_LIMIT': 1
        }
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_details(self, response):
        for image in enumerate(response.xpath('//div[contains(@class, "jumbotron")]/div')):
            image_urls = image.xpath('div[contains(@class, "jumbotron-image")]/img/@src').getall()

    def parse_item(self, response):
        for idx, list_item in enumerate(response.xpath('//div[contains(@class, "slider-wrapper")]')):
            anchor = list_item.xpath('div[contains(@class, "slider-section")]/div/a')
            slider_thumbnail = anchor.xpath('div[contains(@class, "slider-thumbnail")]')
            description = slider_thumbnail.xpath('div[contains(@class, "description-box")]')
            yield {
                'url': anchor.xpath('@href').get(),
                'description': description
            }
            details_page_urls = anchor.xpath('@href').getall()
            for details_page in details_page_urls:
                yield scrapy.Request(url=details_page, callback=self.parse_details)
This is not working, although with my little knowledge of both Scrapy and Python the second parse method should return a list of image_urls. So I have two questions: 1. Is there a better approach for my case? Maybe the whole issue is trying to do too much with one spider? 2. If the approach is OK, what am I doing wrong?
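Two concrete problems stand out in parse_details: for image in enumerate(...) iterates over (index, selector) tuples rather than selectors, and the method never yields anything, so the ImagesPipeline has nothing to act on (it only downloads URLs found in the image_urls field of yielded items). A minimal sketch of what it could yield instead, reusing the field names from MybotItem above:
def parse_details(self, response):
    # collect every image source under the jumbotron and hand the list
    # to the ImagesPipeline via the 'image_urls' field
    image_urls = response.xpath(
        '//div[contains(@class, "jumbotron")]/div'
        '/div[contains(@class, "jumbotron-image")]/img/@src').getall()
    if image_urls:
        yield {
            'url': response.url,
            'image_urls': [response.urljoin(u) for u in image_urls],
        }
Keeping both the MongoDB pipeline and the ImagesPipeline in ITEM_PIPELINES, as the settings above already do, is fine: each yielded item passes through both.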

Scrapy outputs blank CSV

scrapy crawl raamatuvahetus -o raamatuvahetus.csv outputs an empty CSV file, and I have no idea why. All other Scrapy files generated by scrapy startproject are untouched, and all settings are left at their defaults.
import scrapy
from scrapy.exceptions import CloseSpider

class RaamatuvahetusSpider(scrapy.Spider):
    name = 'raamatuvahetus'
    start_urls = ['https://www.raamatuvahetus.ee/et/bookwished.wishall?limit=200']

    def parse(self, response):
        for href in response.xpath("//a[@class='b-info']/@href"):
            yield response.follow(href, callback=self.parse_book)

    def parse_book(self, response):
        wishings = response.xpath("//img[@class='uimg']")
        wishings_count = 0
        if wishings:
            wishings_count = len(wishings)
        if wishings_count < 15:
            raise CloseSpider('Wishings fewer than 15.')
        title = response.xpath("//article[@class='text']/h1/text()").extract_first()
        author = response.xpath("//div[@class='author']/a/text()").extract_first()
        year = response.xpath("//div[@class='year']/text()").extract_first()
        yield
        {
            "Pealkiri": title,
            "Autor": author,
            "Aasta": year,
            "Soovid": wishings_count
        }
Edit:
Solved! Heed, all travelers who face a similar complication -- fret not! I have the answer you seek.
Instead of
yield
{
}
write
yield {
}
(A bare yield on its own line yields None, and the dict literal that follows is parsed as a separate, do-nothing expression statement, so no items ever reach the CSV exporter.)

Scrapy Images Downloading

My spider runs without displaying any errors, but the images are not stored in the folder. Here are my Scrapy files:
Spider.py:
import scrapy
import re
import os
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem, ListResidentialItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["someurl.com"]
    start_urls = [
        "someurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseBasicListingInfo, meta={'item': item})

    def parseBasicListingInfo(item, response):
        item = response.request.meta['item']
        item = ListResidentialItem()
        try:
            image_urls = map(unicode.strip, response.xpath('//a[@itemprop="contentUrl"]/@data-href').extract())
            item['image_urls'] = [x for x in image_urls]
        except IndexError:
            item['image_urls'] = ''
        return item
settings.py:
from scrapy.settings.default_settings import ITEM_PIPELINES
from scrapy.pipelines.images import ImagesPipeline
BOT_NAME = 'production'
SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'
ROBOTSTXT_OBEY = True
DEPTH_PRIORITY = 1
IMAGE_STORE = '/images'
CONCURRENT_REQUESTS = 250
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {
    'scrapy.contrib.pipeline.images.ImagesPipeline': 300,
}
items.py
# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
    pass
My pipelines file is empty; I'm not sure what I am supposed to add to pipelines.py.
Any help is greatly appreciated.
My working end result:
spider.py:
import scrapy
import re
import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.loader.processors import Join, MapCompose, TakeFirst
from scrapy.pipelines.images import ImagesPipeline
from production.items import ProductionItem
from production.items import ImageItem

class productionSpider(scrapy.Spider):
    name = "production"
    allowed_domains = ["url"]
    start_urls = [
        "startingurl.com"
    ]

    def parse(self, response):
        for sel in response.xpath('//html/body'):
            item = ProductionItem()
            img_url = sel.xpath('//a[@idd="followclaslink"]/@href').extract()[0]
            yield scrapy.Request(urlparse.urljoin(response.url, img_url), callback=self.parseImages, meta={'item': item})

    def parseImages(self, response):
        for elem in response.xpath("//img"):
            img_url = elem.xpath("@src").extract_first()
            yield ImageItem(image_urls=[img_url])
Settings.py
BOT_NAME = 'production'
SPIDER_MODULES = ['production.spiders']
NEWSPIDER_MODULE = 'production.spiders'
DEFAULT_ITEM_CLASS = 'production.items'
ROBOTSTXT_OBEY = True
IMAGES_STORE = '/Users/home/images'
DOWNLOAD_DELAY = 2
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# Disable cookies (enabled by default)
items.py
# -*- coding: utf-8 -*-
import scrapy

class ProductionItem(scrapy.Item):
    img_url = scrapy.Field()

# ScrapingList Residential & Yield Estate for sale
class ListResidentialItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()

class ImageItem(scrapy.Item):
    image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
Since you don't know what to put in the pipelines, I assume you can use the default images pipeline provided by Scrapy, so in the settings.py file you can just declare it like
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1
}
Also, your images path is wrong: the leading / means the absolute root path of your machine, so either give the full absolute path to where you want to save, or a path relative to where you run the crawler (and note the setting name is IMAGES_STORE, not IMAGE_STORE):
IMAGES_STORE = '/home/user/Documents/scrapy_project/images'
or
IMAGES_STORE = 'images'
Now, in the spider you extract the URL but you don't save it into the item:
item['image_urls'] = sel.xpath('//a[@data-tealium-id="detail_nav_showphotos"]/@href').extract_first()
The field has to literally be image_urls if you're using the default pipeline.
Now, in the items.py file you need to add the following 2 fields (both are required, with exactly these names):
image_urls = Field()
images = Field()
That should work
In my case it was the IMAGES_STORE path that was causing the problem
I did IMAGES_STORE = 'images' and it worked like a charm!
Here is the complete code:
Settings:
ITEM_PIPELINES = {
    'mutualartproject.pipelines.MyImagesPipeline': 1,
}
IMAGES_STORE = 'images'
Pipeline:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item
Just adding my mistake here, which threw me off for several hours. Perhaps it can help someone.
From scrapy docs (https://doc.scrapy.org/en/latest/topics/media-pipeline.html#using-the-images-pipeline):
Then, configure the target storage setting to a valid value that will be used for storing the downloaded images. Otherwise the pipeline will remain disabled, even if you include it in the ITEM_PIPELINES setting.
For some reason I used a colon ":" instead of an equals sign "=".
# My mistake:
IMAGES_STORE : '/Users/my_user/images'
# Working code
IMAGES_STORE = '/Users/my_user/images'
This doesn't raise an error but instead leads to the pipeline not loading at all, which for me was pretty hard to troubleshoot. (In Python 3, a bare NAME : value line is a variable annotation: valid syntax, but it never assigns anything, so the setting simply does not exist.)
You have to enable SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES in the settings.py file
