Scrapy - Scrape both text and images in the same spider - python

Scrapy super noob here. Problem: I have an HTML page that contains both information that I want to scrape and a URL that I want to follow in order to get image URLs for images that I want to download and save via the Scrapy images pipeline.
My approach to achieve this:
1. Scrape all the details as usual with a parse method
2. Find the URL in the initial page and create a request that has a second parse method as callback, where I build the image_urls list.
So, I have the following setup:
settings.py
...
ITEM_PIPELINES = {
    'crawlbot.pipelines.MybotPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/url/to/images'  # valid path to actual folder
...
pipelines.py
import pymongo

class MybotPipeline(object):
    def __init__(self):
        self.conn = pymongo.MongoClient('localhost', 27017)
        db = self.conn['libraries']
        self.collection = db['books']

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        return item
items.py
import scrapy

class MybotItem(scrapy.Item):
    url = scrapy.Field()
    title = scrapy.Field()
    images = scrapy.Field()
    image_urls = scrapy.Field()
    description = scrapy.Field()
crawler.py
import scrapy
from scrapy.spiders import CrawlSpider

class MySpider(CrawlSpider):
    name = 'myspider'
    allowed_domains = ['books.com']

    def start_requests(self):
        urls = [
            'https://www.books.com/some/url'
        ]
        custom_settings = {
            'DEPTH_LIMIT': 1
        }
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse_item)

    def parse_details(self, response):
        for image in enumerate(response.xpath('//div[contains(@class, "jumbotron")]/div')):
            image_urls = image.xpath('div[contains(@class, "jumbotron-image")]/img/@src').getall()

    def parse_item(self, response):
        for idx, list_item in enumerate(response.xpath('//div[contains(@class, "slider-wrapper")]')):
            anchor = list_item.xpath('div[contains(@class, "slider-section")]/div/a')
            slider_thumbnail = anchor.xpath('div[contains(@class, "slider-thumbnail")]')
            description = slider_thumbnail.xpath('div[contains(@class, "description-box")]')
            yield {
                'url': anchor.xpath('@href').get(),
                'description': description
            }
            details_page_urls = anchor.xpath('@href').getall()
            for details_page in details_page_urls:
                yield scrapy.Request(url=details_page, callback=self.parse_details)
This is not working, although, with my little knowledge of both Scrapy and Python, the second parse method should return a list of image_urls. So I have two questions: 1. Is there a better approach for my case? Maybe the whole issue is trying to do too much with one spider? 2. If the approach is OK, what am I doing wrong?
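For what it's worth, here is a minimal sketch of how the two callbacks could be wired together: the partially built item is carried along in the request's meta so that parse_details can attach image_urls and yield one complete item for both pipelines. The selectors are copied from the snippet above and are assumptions about the actual page:

def parse_item(self, response):
    for list_item in response.xpath('//div[contains(@class, "slider-wrapper")]'):
        anchor = list_item.xpath('div[contains(@class, "slider-section")]/div/a')
        item = {
            'url': anchor.xpath('@href').get(),
            'description': anchor.xpath('.//div[contains(@class, "description-box")]//text()').get(),
        }
        details_url = anchor.xpath('@href').get()
        if details_url:
            # hand the half-finished item to the details callback
            yield response.follow(details_url, callback=self.parse_details, meta={'item': item})

def parse_details(self, response):
    item = response.meta['item']
    # the ImagesPipeline reads this field and downloads every URL to IMAGES_STORE
    item['image_urls'] = response.xpath('//div[contains(@class, "jumbotron-image")]/img/@src').getall()
    yield item

If the src values are relative, they would need response.urljoin(...) before going into image_urls, since the pipeline expects absolute URLs.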

Related

How to download images without error in Scrapy?

I am a newbie to Scrapy. I am trying to download an image.
import scrapy
from scrapy.http import Request

class PlayerSpider(scrapy.Spider):
    name = 'player'
    # allowed_domains = ['nba.com/players']
    start_urls = ['http://www.nba.com/players/']

    def parse(self, response):
        Player_Name = response.css('div#content.nba-player-index a ::attr(title)').extract()
        Player_link = response.css('.nba-player-index__trending-item a::attr(href)').extract()
        links = [url for url in Player_link if url.startswith("/players")]
        for link in links:
            absolute_url = response.urljoin(link)
            yield Request(absolute_url, callback=self.parse_players)

    def parse_players(self, response):
        Player_Name = response.css('section.nba-player-header__details-bottom ::text').extract()
        items = []
        for images in Player_Name:
            item = PlayerSpider()
            images_link = response.css('section.nba-detail-header-wrapper .nba-player-header__headshot img::attr(src)').extract_first()
            image_urls = 'http:{}'.format(images_link)
            item[image_urls]
            return item

        Player_Height = response.css('section.nba-player-vitals__top-left.small-6 p.nba-player-vitals__top-info-imperial ::text').extract()
        Player_Weight = response.css('section.nba-player-vitals__top-right.small-6 p.nba-player-vitals__top-info-imperial ::text').extract()
        yield {
            'Player_name': Player_Name,
            'Player_Height': Player_Height,
            'Player Weight': Player_Weight
        }
I think the files are fine, but I am unable to write a correct spider for getting the image. I am able to grab the image URL but don't know how to store the image using the ImagesPipeline.
items.py
import scrapy
from scrapy.item import Item

class PlayerSpider(scrapy.Item):
    image_url = scrapy.Field()
    images = scrapy.Field()
To enable your image pipeline you must first add it to your project ITEM_PIPELINES setting.
settings.py:
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_STORE = 'images'
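On the spider side, a hedged sketch of how the headshot URL could be handed to the pipeline, assuming the item class in items.py is renamed to something like PlayerItem and its field renamed from image_url to image_urls (the field name ImagesPipeline looks for by default):

def parse_players(self, response):
    item = PlayerItem()  # hypothetical rename of the Item class from items.py
    headshot = response.css('.nba-player-header__headshot img::attr(src)').extract_first()
    if headshot:
        # ImagesPipeline expects a list of absolute URLs in image_urls
        item['image_urls'] = ['http:{}'.format(headshot)]
    yield item

After the crawl, the download results end up in the item's images field and the files themselves under IMAGES_STORE.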

Scrapy Media Pipeline, files not downloading

I am new to Scrapy. I am trying to download files using the media pipeline, but when I run the spider no files are stored in the folder.
spider:
import scrapy
from scrapy import Request
from pagalworld.items import PagalworldItem

class JobsSpider(scrapy.Spider):
    name = "songs"
    allowed_domains = ["pagalworld.me"]
    start_urls = ['https://pagalworld.me/category/11598/Latest%20Bollywood%20Hindi%20Mp3%20Songs%20-%202017.html']

    def parse(self, response):
        urls = response.xpath('//div[@class="pageLinkList"]/ul/li/a/@href').extract()
        for link in urls:
            yield Request(link, callback=self.parse_page)

    def parse_page(self, response):
        songName = response.xpath('//li/b/a/@href').extract()
        for song in songName:
            yield Request(song, callback=self.parsing_link)

    def parsing_link(self, response):
        item = PagalworldItem()
        item['file_urls'] = response.xpath('//div[@class="menu_row"]/a[@class="touch"]/@href').extract()
        yield {"download_link": item['file_urls']}
Item file:
import scrapy

class PagalworldItem(scrapy.Item):
    file_urls = scrapy.Field()
Settings File:
BOT_NAME = 'pagalworld'
SPIDER_MODULES = ['pagalworld.spiders']
NEWSPIDER_MODULE = 'pagalworld.spiders'
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 5
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1
}
FILES_STORE = '/tmp/media/'
The output looks like this (the resulting list of download links, not reproduced here):
def parsing_link(self, response):
    item = PagalworldItem()
    item['file_urls'] = response.xpath('//div[@class="menu_row"]/a[@class="touch"]/@href').extract()
    yield {"download_link": item['file_urls']}
You are yielding:
yield {"download_link": ['http://someurl.com']}
where, for Scrapy's Media/File pipeline to work, you need to yield an item that contains a file_urls field. So try this instead:
def parsing_link(self, response):
    item = PagalworldItem()
    item['file_urls'] = response.xpath('//div[@class="menu_row"]/a[@class="touch"]/@href').extract()
    yield item
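If you also want the stored file paths recorded on the item, the item class can additionally declare a files field, which FilesPipeline populates once the downloads complete:

import scrapy

class PagalworldItem(scrapy.Item):
    file_urls = scrapy.Field()  # input: list of URLs for FilesPipeline to download
    files = scrapy.Field()      # output: filled by FilesPipeline with path and checksum info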

Scraping interactive website

I'm trying to scrape the names of courses along with their number of students from Udacity, to find out which courses are the most popular. I managed to create the code for the item:
import scrapy

class UdacityItem(scrapy.Item):
    name = scrapy.Field()
    users = scrapy.Field()
and spider:
import scrapy
from Udacity.items import UdacityItem
import re

class DmozSpider(scrapy.Spider):
    name = "UdSpider"
    allowed_domains = ["udacity.com"]
    start_urls = ["https://www.udacity.com/courses/all"]

    def parse(self, response):
        sites = response.xpath('//h3/a')
        for s in sites:
            t = UdacityItem()
            # name & url
            t['name'] = s.xpath('text()').extract()[0].strip()
            url = response.urljoin(s.xpath('@href').extract()[0])
            # request
            req = scrapy.Request(url, callback=self.second)
            req.meta['item'] = t
            # execute
            yield req

    def second(self, response):
        t = response.meta['item']
        strong = response.xpath('//strong[@data-course-student-count]/text()').extract()[0]
        t['users'] = strong
        yield t
As a result I'm getting the name of the course, but instead of the number of students I get the text 'thousands of'. When I open an example page in the browser I see that 'thousands of' is the initial value, and after 1-2 seconds this text changes into the proper number (which I want to get).
And here are my questions:
Why is this replacement happening? Is this JavaScript code? I would like to understand the mechanism of this change.
How can I capture the proper number of students using Scrapy? I hope this is possible.
Thank you in advance for your help with that.
To get the enrollments count, you would have to simulate the API request to https://www.udacity.com/api/summaries endpoint for a specific course id, which can be extracted from the URL itself - for example, it is ud898 for the https://www.udacity.com/course/javascript-promises--ud898 URL.
Complete spider:
import json
import re
from urllib import quote_plus

import scrapy

class UdacityItem(scrapy.Item):
    name = scrapy.Field()
    users = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "UdSpider"
    allowed_domains = ["udacity.com"]
    start_urls = ["https://www.udacity.com/courses/all"]

    def parse(self, response):
        sites = response.xpath('//h3/a')
        for s in sites:
            t = UdacityItem()
            # name & url
            t['name'] = s.xpath('text()').extract()[0].strip()
            url = response.urljoin(s.xpath('@href').extract()[0])
            # request
            req = scrapy.Request(url, callback=self.second)
            req.meta['item'] = t
            # execute
            yield req

    def second(self, response):
        queries = [{
            "limit": 1,
            "model": "CourseStudentsSummary",
            "locator": {
                "sample_frequency": "daily",
                "content_context": [{
                    "node_key": re.search(r'--(.*?)$', response.url).group(1)
                }]
            }
        }]
        yield scrapy.Request(method="GET",
                             url="https://www.udacity.com/api/summaries?queries=" + quote_plus(json.dumps(queries)),
                             callback=self.parse_totals)

    def parse_totals(self, response):
        print(json.loads(response.body[5:].strip())["summaries"]["default"][0]["data"]["total_enrollments"])
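As a small variation (a sketch only, assuming the same JSON layout and that the item is still passed through meta as in the parse method above), the count could be stored on the item instead of printed, so it reaches the feed exports. Note the answer uses Python 2's urllib.quote_plus; on Python 3 it lives in urllib.parse.

    def second(self, response):
        queries = [{
            "limit": 1,
            "model": "CourseStudentsSummary",
            "locator": {
                "sample_frequency": "daily",
                "content_context": [{
                    "node_key": re.search(r'--(.*?)$', response.url).group(1)
                }]
            }
        }]
        yield scrapy.Request(
            url="https://www.udacity.com/api/summaries?queries=" + quote_plus(json.dumps(queries)),
            meta={'item': response.meta['item']},
            callback=self.parse_totals)

    def parse_totals(self, response):
        item = response.meta['item']
        # the body is prefixed with a short guard string, hence the [5:] slice
        item['users'] = json.loads(response.body[5:].strip())["summaries"]["default"][0]["data"]["total_enrollments"]
        yield item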

Limit how many elements Scrapy can collect

I am using Scrapy to collect some data. My Scrapy program collects 100 elements in one session. I need to limit it to 50 or any arbitrary number. How can I do that? Any solution is welcome. Thanks in advance.
# -*- coding: utf-8 -*-
import re
import scrapy

class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()
    title = scrapy.Field()
    tag = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["raleigh.craigslist.org"]
    start_urls = [
        "http://raleigh.craigslist.org/search/bab"
    ]
    BASE_URL = 'http://raleigh.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        match = re.search(r"(\w+)\.html", response.url)
        if match:
            item_id = match.group(1)
            url = self.BASE_URL + "reply/ral/bab/" + item_id
            item = DmozItem()
            item["link"] = response.url
            item["title"] = "".join(response.xpath("//span[@class='postingtitletext']//text()").extract())
            item["tag"] = "".join(response.xpath("//p[@class='attrgroup']/span/b/text()").extract()[0])
            return scrapy.Request(url, meta={'item': item}, callback=self.parse_contact)

    def parse_contact(self, response):
        item = response.meta['item']
        item["attr"] = "".join(response.xpath("//div[@class='anonemail']//text()").extract())
        return item
This is what the CloseSpider extension and the CLOSESPIDER_ITEMCOUNT setting were made for:
An integer which specifies a number of items. If the spider scrapes more than that amount of items and those items are passed by the item pipeline, the spider will be closed with the reason closespider_itemcount. If zero (or not set), spiders won't be closed by number of passed items.
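For example, to stop after roughly 50 items, the setting can go straight into settings.py (the limit is approximate, because requests already in flight are still processed):

# settings.py
CLOSESPIDER_ITEMCOUNT = 50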
I tried alecxe's answer but I had to combine all 3 limits to make it work, so I'm leaving it here just in case someone else has the same issue:
class GenericWebsiteSpider(scrapy.Spider):
    """This generic website spider extracts text from websites"""
    name = "generic_website"
    custom_settings = {
        'CLOSESPIDER_PAGECOUNT': 15,
        'CONCURRENT_REQUESTS': 15,
        'CLOSESPIDER_ITEMCOUNT': 15
    }
    ...

Make Scrapy follow links and collect data

I am trying to write a program in Scrapy that opens links and collects data from this tag: <p class="attrgroup"></p>.
I've managed to make Scrapy collect all the links from the given URL, but not to follow them. Any help is much appreciated.
You need to yield Request instances for the links to follow, assign a callback and extract the text of the desired p element in the callback:
# -*- coding: utf-8 -*-
import scrapy

# item class included here
class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    link = scrapy.Field()
    attr = scrapy.Field()

class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["craigslist.org"]
    start_urls = [
        "http://chicago.craigslist.org/search/emd?"
    ]
    BASE_URL = 'http://chicago.craigslist.org/'

    def parse(self, response):
        links = response.xpath('//a[@class="hdrlnk"]/@href').extract()
        for link in links:
            absolute_url = self.BASE_URL + link
            yield scrapy.Request(absolute_url, callback=self.parse_attr)

    def parse_attr(self, response):
        item = DmozItem()
        item["link"] = response.url
        item["attr"] = "".join(response.xpath("//p[@class='attrgroup']//text()").extract())
        return item
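To check the result quickly, the spider can be run with a feed export, e.g.:

scrapy crawl dmoz -o items.json

which writes every returned item (link and attr) to a JSON file.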
