I'm trying to scrape a few images from a website. Sorry in advance, I am not very experimented with Python and it is the first time I try using scrapy.
I manage apparently to get all the images I need, but they somehow get lost and my output folder remains empty.
I looked at a few tutorials and all the similar questions I could find on SO, but nothing seemed to really work out.
My spider:
from testspider.items import TestspiderItem
import datetime
import scrapy
class PageSpider(scrapy.Spider):
name = 'page-spider'
start_urls = ['http://scan-vf.co/one_piece/chapitre-807/1']
def parse(self, response):
SET_SELECTOR = '.img-responsive'
page = 1
for imgPage in response.css(SET_SELECTOR):
IMAGE_SELECTOR = 'img ::attr(src)'
imgURL = imgPage.css(IMAGE_SELECTOR).extract_first()
title = 'op-807-' + str(page)
page += 1
yield TestspiderItem({'title':title, 'image_urls':[imgURL]})
My items:
import scrapy
class TestspiderItem(scrapy.Item):
title = scrapy.Field()
image_urls = scrapy.Field()
images = scrapy.Field()
My settings:
BOT_NAME = 'testspider'
SPIDER_MODULES = ['testspider.spiders']
NEWSPIDER_MODULE = 'testspider.spiders'
DEFAULT_ITEM_CLASS = 'testspider.items'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = {
'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGE_STORE = '/home/*******/documents/testspider/output'
If you could be so kind as to help me understanding what's missing / what's incorrect, I would be grateful
If you check a source code (usually Ctrl+U in a browser) you'll find that each img is a something like this:
<img class="img-responsive" src="" data-src=' https://www.scan-vf.co/uploads/manga/one_piece/chapters/chapitre-807/01.jpg ' alt='One Piece: Chapter chapitre-807 - Page 1'/>
As you can see you need to use data-src in your code instead of src:
IMAGE_SELECTOR = 'img ::attr(data-src)'
Related
Scrapy super noob here. Problem: I have an html page that contains both information that I want to scrape and an url that I want to follow to get images urls for images that I want to download and save via the scrapy image pipeline.
My approach to achieve this:
1. Scrape all the details as usual with a parse method
2. Find the url in the initial page, create a request that has a second parse method as callback where I build the image_urls list.
So, I have the following setup:
settings.py
...
ITEM_PIPELINES = {
'crawlbot.pipelines.MybotPipeline': 300,
'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/url/to/images' #valid path to actual folder
...
pipelines.py
import pymongo
class MybotPipeline(object):
def __init__(self):
self.conn = pymongo.MongoClient('localhost', 27017)
db = self.conn['libraries']
self.collection = db['books']
def process_item(self, item, spider):
self.collection.insert(dict(item))
return item
items.py
import scrapy
class MybotItem(scrapy.Item):
url = scrapy.Field()
title = scrapy.Field()
images = scrapy.Field()
image_urls = scrapy.Field()
description = scrapy.Field()
crawler.py
import scrapy
from scrapy.spiders import CrawlSpider
class MySpider(CrawlSpider):
name = 'myspider'
allowed_domains = ['books.com']
def start_requests(self):
urls = [
'https://www.books.com/some/url'
]
custom_settings = {
'DEPTH_LIMIT': 1
}
for url in urls:
yield scrapy.Request(url=url, callback=self.parse_item)
def parse_details(self, response):
for image in enumerate(response.xpath('//div[contains(#class, "jumbotron")]/div')):
image_urls = image.xpath('div[contains(#class, "jumbotron-image")]/img/#src').getall()
def parse_item(self, response):
for idx, list_item in enumerate(response.xpath('//div[contains(#class, "slider-wrapper")]')):
anchor = list_item.xpath('div[contains(#class, "slider-section")]/div/a')
slider_thumbnail = anchor.xpath('div[contains(#class, "slider-thumbnail")]')
description = slider_thumbnail.xpath('div[contains(#class, "description-box")]')
yield {
'url': anchor.xpath('#href').get(),
'description': description
}
details_page_urls = anchor.xpath('#href').getall()
for details_page in details_page_urls:
yield scrapy.Request(url=details_page, callback=self.parse_details)
This is not working, although with my little knowledge of both Scrapy and Python, the second parse method should return a list of image_urls. So I have 2 questions: 1. is there a better approach for my case? Maybe the whole issue is in trying to do too much with one spider? 2. If the approach is ok, what am I doing wrong?
I am newbie to scrapy. I am trying to download an image.
import scrapy
from scrapy.http import Request
class PlayerSpider(scrapy.Spider):
name = 'player'
#allowed_domains = ['nba.com/players']
start_urls = ['http://www.nba.com/players/']
def parse(self, response):
Player_Name = response.css('div#content.nba-player-index a ::attr(title)').extract()
Player_link = response.css('.nba-player-index__trending-item a::attr(href)').extract()
links = [url for url in Player_link if url.startswith("/players")]
for link in links:
absolute_url = response.urljoin(link)
yield Request(absolute_url, callback=self.parse_players)
def parse_players(self, response):
Player_Name = response.css('section.nba-player-header__details-bottom ::text').extract()
items=[]
for images in Player_Name:
item = PlayerSpider()
images_link = response.css('section.nba-detail-header-wrapper .nba-player-header__headshot img::attr(src)').extract_first()
image_urls = 'http:{}'.format(images_link)
item[image_urls]
return item
Player_Height = response.css('section.nba-player-vitals__top-left.small-6 p.nba-player-vitals__top-info-imperial ::text').extract()
Player_Weight = response.css('section.nba-player-vitals__top-right.small-6 p.nba-player-vitals__top-info-imperial ::text').extract()
yield {
'Player_name' : Player_Name,
'Player_Height' : Player_Height,
'Player Weight' : Player_Weight
}
I think files are good. But I am unable to write correct spider for getting the image. I am able to grab the image URL but don't know how to store the image using imagePipeline.
items.py
import scrapy
from scrapy.item import Item
class PlayerSpider(scrapy.Item):
image_url = scrapy.Field()
images = scrapy.Field()
pass
To enable your image pipeline you must first add it to your project ITEM_PIPELINES setting.
settings.py:
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'images'
link
I am trying to grab details from a real estate listing page. I can grab all the data, I just can't seem to export it..
Perhaps a problem with the way I use the yield keyword. The code work for the most part:
Visits page 1, example.com/kittens
Goes to page 2, example.com/puppers. Here are 10 apartments listed in blocks. I can get data from each block, but I need additional info from inside the hyperlink.
Visits the hyperlink, say, example.com/puppers/apartment1. It grabs some info from here as well, but I can't seem to return this data to include it in my HousingItem() class.
import scrapy
from urllib.parse import urljoin
class HousingItem(scrapy.Item):
street = scrapy.Field()
postal = scrapy.Field()
city = scrapy.Field()
url = scrapy.Field()
buildY = scrapy.Field()
on_m = scrapy.Field()
off_m = scrapy.Field()
class FAppSpider(scrapy.Spider):
name = 'f_app'
allowed_domains = ['example.com']
start_urls = ['https://www.example.com/kittens']
def parse(self, response):
yield scrapy.Request(url="https://www.example.com/puppers",
callback=self.parse_puppers)
def parse_inside_pupper(self, response):
item = HousingItem()
item['buildY'] = response.xpath('').extract_first().strip()
item['on_m'] = response.xpath('').extract_first().strip()
item['off_m'] = response.xpath('').extract_first().strip()
def parse_puppers(self, response):
base_url = 'https://www.example.com/'
for block in response.css('div.search-result-main'):
item = HousingItem()
item['street'] = block.css(''),
item['postcode'] = block.css(''),
item['city'] = block.css('')
item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
# Problem area from here..
yield response.follow(url=item['url'],callback=self.parse_inside_pupper)
# yield scrapy.request(url=item['url'],callback=self.parse_inside_pupper)?
yield item
FEED_EXPORT_FIELDS is adjusted in my SETTINGS.py. The 4 items from parse_puppers() get exported correctly, parse_inside_puppers() data is correct in the console, but wont export.
I use scrapy crawl f_app -o raw_data.csv to run me spider. Thanks in advance, appreciate all the help.
p.s. im fairly new to python and practising, i bet you noticed.
You need to send you current item to the parse_inside_pupper using meta param:
def parse_puppers(self, response):
base_url = 'https://www.example.com/'
for block in response.css('div.search-result-main'):
item = HousingItem()
item['street'] = block.css(''),
item['postcode'] = block.css(''),
item['city'] = block.css('')
item['url'] = urljoin(base_url, block.css('div.search-result-header > a::attr(href)')[0].extract())
yield response.follow(url=item['url'],callback=self.parse_inside_pupper, meta={"item": item})
After that you can use it inside parse_inside_pupper (and yield it from here):
def parse_inside_pupper(self, response):
item = response.meta["item"]
item['buildY'] = response.xpath('').extract_first().strip()
item['on_m'] = response.xpath('').extract_first().strip()
item['off_m'] = response.xpath('').extract_first().strip()
yield item
I'm trying to extract json data with Scrapy from a website, but i'm facing some issues, like when i run my spider, gives no error and says that crawled 0 pages. I also use the command to store de output to json file to see the output.
The following code is my spider:
import scrapy
class WineSpider(scrapy.Spider):
name = "SpidyWine"
i = 0
url = 'https://maiscarrinho.com/api/search?q=vinho&pageNumber=%s&pageSize=10'
start_urls = [url % 1]
def parse(self, response):
data = json.loads(response.body)
for item in data['results']:
yield {
'Image': item.get('image')
}
if data['Image']:
i = i + 1
yield scrapy.Request(self.url % i, callback=self.parse)
And my class of items:
import scrapy
class MaiscarrinhoItem(scrapy.Item):
image = scrapy.Field()
price = scrapy.Field()
supermarket = scrapy.Field()
promotion = scrapy.Field()
wineName = scrapy.Field()
brand = scrapy.Field()
For now, i'm just using the Image field in my spider to get things more easier.
Also, my ideia when i wrote the if statement in my spider was to 'deal' with the infinite scorlling, when the json api has 'Image' means that that page have content.
Output in Console
Thanks in advance
You did everything right except a very small mistake.
The field name which contains the image is Image and not image
Try :
yield {
'Image': item.get('Image')
}
There is probably something also wrong with your ITEM_PIPELINES in settings.py file
Well answering to my question and after digging into my code after some time... I realized it was about identation errors and some errors of syntaxe.
Another point was the pipeline, i forgot to change de last name to the real name of my pipeline, so instead of having 'Maiscarrinho.pipelines.SomePipeline': 300 now i have 'Maiscarrinho.pipelines.MaiscarrinhoPipeline': 300
The bellow code are extracting the images like i want, but there is one problem yet. Since the page have infinite scrolling i have a condition to evaluate if there is an element named 'Image but for some reason i'm not getting the desired result. It should extract 40 pages each with 10 images.
import scrapy
import json
class WineSpider(scrapy.Spider):
name = "SpidyWine"
url = 'https://maiscarrinho.com/api/search?q=vinho&pageNumber=%s&pageSize=10'
start_urls = [url % 1]
i = 1
def parse(self, response):
data = json.loads(response.body.decode('utf-8'))
for item in data['results']:
yield {
'Image': item.get('Image')
}
if item.get('Image'):
WineSpider.i += 1
yield scrapy.Request(self.url % WineSpider.i, callback=self.parse)
I have a scrapy spider that receives the input of a desired keyword and then yields a search result url. It then crawls that URL to scrape desired values about each car result within 'item'. I am trying to add within my yielded items the url for each full sized car image link that accompanies each car on the vehicle list of results.
The specific url that is being crawled when I enter the keyword as being "honda" is the following:
Honda search results example
I have been having trouble figuring out the correct way to write the xpath and then include whatever list of image url's I acquire into the spider's 'item' I yield at the last part of my code.
Right now when Items is saved to a .csv file with the below lkq.py spider being run with the command "scrapy crawl lkq -o items.csv -t csv" the column of the items.csv file for Picture is just all zeros instead of the image url's.
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import scrapy
from scrapy.shell import inspect_response
from scrapy.utils.response import open_in_browser
keyword = raw_input('Keyword: ')
url = 'http://www.lkqpickyourpart.com/DesktopModules/pyp_vehicleInventory/getVehicleInventory.aspx?store=224&page=0&filter=%s&sp=&cl=&carbuyYardCode=1224&pageSize=1000&language=en-US' % (keyword,)
class Cars(scrapy.Item):
Make = scrapy.Field()
Model = scrapy.Field()
Year = scrapy.Field()
Entered_Yard = scrapy.Field()
Section = scrapy.Field()
Color = scrapy.Field()
Picture = scrapy.Field()
class LkqSpider(scrapy.Spider):
name = "lkq"
allowed_domains = ["lkqpickyourpart.com"]
start_urls = (
url,
)
def parse(self, response):
picture = response.xpath(
'//href=/text()').extract()
section_color = response.xpath(
'//div[#class="pypvi_notes"]/p/text()').extract()
info = response.xpath('//td["pypvi_make"]/text()').extract()
for element in range(0, len(info), 4):
item = Cars()
item["Make"] = info[element]
item["Model"] = info[element + 1]
item["Year"] = info[element + 2]
item["Entered_Yard"] = info[element + 3]
item["Section"] = section_color.pop(
0).replace("Section:", "").strip()
item["Color"] = section_color.pop(0).replace("Color:", "").strip()
item["Picture"] = picture.pop(0).strip()
yield item
I don't really understand why you were using an xpath like '//href=/text()', I would recommend reading some xpath tutorial first, here is a very good one.
If you want to get all the images urls I think this is what you want
pictures = response.xpath('//img/#src').extract()
Now picture.pop(0).strip() will only get you the last of the urls and strip it, remember that .extract() returns a list, so pictures now contains all the image links, just choose there which ones you need.