I tried to create a Scrapy spider to download some JSON files from a site.
This is my Scrapy spider:
(I first tested the spider so that it only outputs the link to the JSON file, which works fine - see the commented code below.)
But I want to download the JSON files to a folder on my PC.
import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l,
                                     callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2,
                                     callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)

        item = DownfilesItem()
        item['file_urls'] = tmpDownloadLink
        yield item
        # yield {
        #     "link": tmpDownloadLink,
        # }
And these are the changes I made in settings.py:
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
IMAGES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
But unfortunately the download of the JSON files is not working.
How can I download the JSON files to the defined folder?
You have two problems.
item['file_urls'] should be a list.
IMAGES_STORE should be FILES_STORE
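For the settings change, a minimal sketch of the corrected settings.py (keeping the download path from the question):

ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}

FILES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'

And here is the corrected spider: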
import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l,
                                     callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2,
                                     callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field()

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)

        item = DownfilesItem()
        item['file_urls'] = [tmpDownloadLink]
        yield item
        # yield {
        #     "link": tmpDownloadLink,
        # }
EDIT:
In order to set the file's name do this:
settings.py:
ITEM_PIPELINES = {
    'yourprojectname.pipelines.ProcessPipeline': 1,
}
FILES_STORE = r'C:\DOWNLOAD\DATASETS\Spanish'
pipelines.py:
import os
from urllib.parse import unquote

from scrapy.pipelines.files import FilesPipeline


class ProcessPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        file_name = os.path.basename(unquote(request.url))
        return file_name
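For reference, a quick sketch of what that file_path() returns; the URL below is made up purely to illustrate the basename/unquote behaviour:

import os
from urllib.parse import unquote

# hypothetical URL, only to show what file_path() would return for it
url = 'https://example.org/dictionary/Spanish/Spanish%20nouns.json'
print(os.path.basename(unquote(url)))  # -> 'Spanish nouns.json'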
EDIT 2:
writing additional information to file:
import json

import scrapy


class spiderWords(scrapy.Spider):
    name = 'spiderWords'
    allowed_domains = ['kaikki.org']
    start_urls = ['https://kaikki.org/dictionary/Spanish/words.html']
    erg = {}

    def parse(self, response):
        tmpLinks = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l in tmpLinks:
            l = response.urljoin(l)
            request = scrapy.Request(l, callback=self.parseDetails)
            yield request

    def parseDetails(self, response):
        tmpLinks2 = response.xpath("(//ul)[2]/li/a/@href").getall()
        for l2 in tmpLinks2:
            l2 = response.urljoin(l2)
            request = scrapy.Request(l2, callback=self.parseDownload)
            yield request

    def parseDownload(self, response):
        class DownfilesItem(scrapy.Item):
            file_urls = scrapy.Field()
            files = scrapy.Field()

        key = response.xpath('//ul[@class="breadcrumb"]/li[last()]/text()').get()
        self.erg[key] = response.url

        tmpDownloadLink = response.xpath("//p[contains(text(), 'JSON')]/a/@href").get()
        tmpDownloadLink = response.urljoin(tmpDownloadLink)

        item = DownfilesItem()
        item['file_urls'] = [tmpDownloadLink]
        yield item

    def close(self, reason):
        with open('erg.json', 'w') as f:
            f.write(json.dumps(self.erg, indent=4))
Related
I have a problem with my script: the same file name and the same PDF are downloaded every time. I have checked my results without downloading the files and I get unique data; it's only when I use the pipeline that it somehow produces duplicates for the download.
Here's my script:
import scrapy

from environment.items import fcpItem


class fscSpider(scrapy.Spider):
    name = 'fsc'
    start_urls = ['https://fsc.org/en/members']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath("(//div[@class='content__wrapper field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items']/div[@class='content__item even field__item'])[position() >1]")
        loader = fcpItem()
        names_add = response.xpath(".//div[@class = 'field__item resource-item']/article//span[@class='media-caption file-caption']/text()").getall()
        url = response.xpath(".//div[@class = 'field__item resource-item']/article/div[@class='actions']/a//@href").getall()
        pdf = [response.urljoin(x) for x in url if '#' is not x]
        names = [x.split(' ')[0] for x in names_add]
        for nm, pd in zip(names, pdf):
            loader['names'] = nm
            loader['pdfs'] = [pd]
            yield loader
items.py
class fcpItem(scrapy.Item):
    names = Field()
    pdfs = Field()
    results = Field()
pipelines.py
class DownfilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None, item=None):
        items = item['names'] + '.pdf'
        return items
settings.py
from pathlib import Path
import os
BASE_DIR = Path(__file__).resolve().parent.parent
FILES_STORE = os.path.join(BASE_DIR, 'fsc')
ROBOTSTXT_OBEY = False
FILES_URLS_FIELD = 'pdfs'
FILES_RESULT_FIELD = 'results'
ITEM_PIPELINES = {
    'environment.pipelines.pipelines.DownfilesPipeline': 150
}
I am using css instead of xpath.
From the Chrome debug panel, the div.field__item.resource-item tag is the root of each item in the PDF list. Under that div tag are the title of the PDF and the a tag holding the file download URL.
Between the root tag and those two tags there are several child and sibling levels, so XPath is not a clean method here and is hard to maintain. A CSS selector is much better because it can pick elements up from the root without spelling out the whole relationship path - sub or grand-sub levels do not matter. It also avoids the index-synchronization problem of keeping a URL array and a title array matched by index.
The other key points are decoding the URL path and setting file_urls to an array type, even for a single item.
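To make the comparison concrete, a short sketch of the two selector styles (the class names come from the spider below; the XPath line is only meant to illustrate the extra verbosity and is not taken from the question):

# CSS: pick each PDF item container directly; intermediate wrapper tags do not matter
books = response.css('div.field__item.resource-item')
# A roughly equivalent XPath has to spell out the class matching explicitly
books = response.xpath('//div[contains(@class, "field__item") and contains(@class, "resource-item")]')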
fsc_spider.py
import scrapy
import urllib.parse

from quotes.items import fcpItem


class fscSpider(scrapy.Spider):
    name = 'fsc'
    start_urls = [
        'https://fsc.org/en/members',
    ]

    def parse(self, response):
        for book in response.css('div.field__item.resource-item'):
            url = urllib.parse.unquote(book.css('div.actions a::attr(href)').get(), encoding='utf-8', errors='replace')
            url_left = url[0:url.rfind('/')] + '/'
            title = book.css('span.media-caption.file-caption::text').get()

            item = fcpItem()
            item['original_file_name'] = title.replace(' ', '_')
            item['file_urls'] = ['https://fsc.org' + url_left + title.replace(' ', '%20')]
            yield item
items.py
import scrapy


class fcpItem(scrapy.Item):
    file_urls = scrapy.Field()
    files = scrapy.Field()
    original_file_name = scrapy.Field()
pipelines.py
import scrapy

from scrapy.pipelines.files import FilesPipeline


class fscPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        file_name: str = request.url.split("/")[-1].replace('%20', '_')
        return file_name
settings.py
BOT_NAME = 'quotes'
FILES_STORE = 'downloads'
SPIDER_MODULES = ['quotes.spiders']
NEWSPIDER_MODULE = 'quotes.spiders'
FEED_EXPORT_ENCODING = 'utf-8'
ROBOTSTXT_OBEY = True
ITEM_PIPELINES = { 'quotes.pipelines.fscPipeline': 1}
file structure
execution
quotes>scrapy crawl fsc
result
The problem is that you are overwriting the same scrapy item on every iteration.
What you need to do is create a new item each time your parse method yields. I have tested this and confirmed that it produces the results you desire.
I made an inline note in my example below on the line that needs to be changed.
For example:
import scrapy

from environment.items import fcpItem


class fscSpider(scrapy.Spider):
    name = 'fsc'
    start_urls = ['https://fsc.org/en/members']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(
                url,
                callback=self.parse
            )

    def parse(self, response):
        content = response.xpath("(//div[@class='content__wrapper field field--name-field-content field--type-entity-reference-revisions field--label-hidden field__items']/div[@class='content__item even field__item'])[position() >1]")
        names_add = response.xpath(".//div[@class = 'field__item resource-item']/article//span[@class='media-caption file-caption']/text()").getall()
        url = response.xpath(".//div[@class = 'field__item resource-item']/article/div[@class='actions']/a//@href").getall()
        pdf = [response.urljoin(x) for x in url if '#' is not x]
        names = [x.split(' ')[0] for x in names_add]
        for nm, pd in zip(names, pdf):
            loader = fcpItem()  # Here you create a new item each iteration
            loader['names'] = nm
            loader['pdfs'] = [pd]
            yield loader
I am running the Scrapy spider below on Airbnb for academic purposes. I scrape all listings first
(such as: https://www.airbnb.com/s/Berlin--Germany/homes?tab_id=all_tab&query=Berlin%2C%20Germany&place_id=ChIJAVkDPzdOqEcRcDteW0YgIQQ&checkin=2020-05-01&adults=1&refinement_paths%5B%5D=%2Fhomes&source=structured_search_input_header&search_type=search_query&checkout=2020-05-02)
to get their ids and then go to the listing's page
(such as: https://www.airbnb.de/rooms/20839690?location=Berlin&check_in=2020-05-01&check_out=2020-05-02&adults=1)
and get the geo-data from the details JSON. Ideally, I would like to have a final JSON nested like:
{{'ID': ID1, 'Title': Title1, 'Latitude': Lat1},{'ID': ID2, 'Title': Title2, 'Latitude': Lat2}}
Because of the recursive structure, I currently get the full list of titles, prices etc. in the first pass, while lng and lat come as only one element per loop run, so the output looks like:
{{Price1, Price2, Price3..., id1, id2...lng1, lat1}, {Price1, Price2, Price3..., id1, id2..., lng2, lat2}}
Any idea how I can restructure the code to get the above structure?
Cheers
marcello
Spider:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Spider
from scrapy_splash import SplashRequest
from airbnb.items import AirbnbItem
import json
import pprint

all_ids = []
detail = {}


class AirbnbSpider(scrapy.Spider):
    name = 'airbnb_spider'
    allowed_domains = ['airbnb.com', 'airbnb.de']
    start_urls = ['https://www.airbnb.de/s/Berlin/homes?checkin=2020-05-01&checkout=2020-05-02&adults=1']

    def parse(self, response):
        item = AirbnbItem()
        for listing in response.xpath('//div[@class = "_fhph4u"]'):
            detail["title"] = listing.xpath('//a[@class = "_i24ijs"]/@aria-label').extract()
            detail["price"] = listing.xpath('//span[@class = "_1p7iugi"]/text()').extract()
            detail["rating"] = listing.xpath('//span[@class = "_3zgr580"]/text()').get()
            detail["id"] = listing.xpath('//a[@class = "_i24ijs"]/@target').extract()
            #item["link"] = listing.xpath('//a[@class = "_i24ijs"]/@href').extract()

            x_id = [i.split('_')[1] for i in detail['id']]
            detail['id'] = x_id

            for i in x_id:
                link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+i+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
                yield scrapy.Request(url = link, callback =self.parse_detail)

    def parse_detail(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        detail["lat"] = jsonresponse["pdp_listing_detail"]["lat"]
        detail["lng"] = jsonresponse["pdp_listing_detail"]["lng"]
        return detail
Items
import scrapy


class AirbnbItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    price = scrapy.Field()
    id = scrapy.Field()
    rating = scrapy.Field()
    lat = scrapy.Field()
    lng = scrapy.Field()
    pass
You can pass information to the parse_detail method and yield from there:
def parse(self, response):
    item = AirbnbItem()
    for listing in response.xpath('//div[@class = "_fhph4u"]'):
        detail["title"] = listing.xpath('//a[@class = "_i24ijs"]/@aria-label').get()
        detail["price"] = listing.xpath('//span[@class = "_1p7iugi"]/text()').get()
        detail["rating"] = listing.xpath('//span[@class = "_3zgr580"]/text()').get()
        detail["id"] = listing.xpath('//a[@class = "_i24ijs"]/@target').get()
        #item["link"] = listing.xpath('//a[@class = "_i24ijs"]/@href').get()

        detail['id'] = detail['id'].split('_')[1]
        link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+detail['id']+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
        yield scrapy.Request(url = link,
                             meta={'item': detail},  # pass information to the next method
                             callback =self.parse_detail)

def parse_detail(self, response):
    jsonresponse = json.loads(response.body_as_unicode())
    detail = response.meta['item']
    detail["lat"] = jsonresponse["pdp_listing_detail"]["lat"]
    detail["lng"] = jsonresponse["pdp_listing_detail"]["lng"]
    yield detail
BTW, Item class is useless, do not use it.
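As an aside, newer Scrapy versions (1.7+) also support cb_kwargs for passing data between callbacks, which avoids both meta and the module-level detail dict. A minimal sketch of the same two methods under that assumption (the XPath expressions and API link are taken from the code above):

def parse(self, response):
    for listing in response.xpath('//div[@class = "_fhph4u"]'):
        detail = {}  # build a fresh dict per listing instead of reusing a shared one
        detail["title"] = listing.xpath('//a[@class = "_i24ijs"]/@aria-label').get()
        detail["price"] = listing.xpath('//span[@class = "_1p7iugi"]/text()').get()
        detail["rating"] = listing.xpath('//span[@class = "_3zgr580"]/text()').get()
        detail["id"] = listing.xpath('//a[@class = "_i24ijs"]/@target').get().split('_')[1]
        link = 'https://www.airbnb.de/api/v2/pdp_listing_details/'+detail['id']+'?_format=for_rooms_show&_p3_impression_id=p3_1587291065_1e%2FBlC2IefkrfTQe&adults=1&check_in=2020-05-01&check_out=2020-05-02&key=d306zoyjsyarp7ifhu67rjxn52tv0t20&'
        yield scrapy.Request(url=link,
                             cb_kwargs={'detail': detail},  # passed to the callback as a keyword argument
                             callback=self.parse_detail)

def parse_detail(self, response, detail):
    jsonresponse = json.loads(response.text)
    detail["lat"] = jsonresponse["pdp_listing_detail"]["lat"]
    detail["lng"] = jsonresponse["pdp_listing_detail"]["lng"]
    yield detail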
I recently had a project crawling Google Play Store apps for the Vietnam region, and realized that the request doesn't run the callback function for all of the URLs that have been returned.
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http.request import Request
from urllib.parse import urlparse, parse_qsl, urlencode
import scrapy


class GooglePlayStoreSpider(CrawlSpider):
    name = 'google_play'
    allowed_domains = ['play.google.com']
    start_urls = ['http://play.google.com']

    rules = (
        Rule(LinkExtractor(allow=('https://play.google.com/store/apps/details')), follow=True,
             process_links='process_links',
             callback='parse_1'),
    )

    crawled_ids = []
    first_init = False

    def parse_start_url(self, response):
        # print("-------------- PRINTING SECTION START_URL --------------")
        if not self.first_init:
            self.first_init = True
            extractor = LinkExtractor(allow=('/store/apps/category/.*',))
            raw_links = extractor.extract_links(response)
            links = self.process_links(raw_links)
            return [
                scrapy.Request('{}'.format(link.url))
                for link in links
            ]
        else:
            # print("============ START_URL ELSE PART ============")
            pass

    def process_links(self, links):
        new_links = []
        for link in links:
            old_url = link.url
            if not old_url.startswith('https://play.google.com/store/apps/'):
                continue
            old_url_obj = urlparse(old_url)
            old_url_query = dict(parse_qsl(old_url_obj.query))

            if old_url_obj.path == '/store/apps/details':
                if old_url_query['id'] in self.crawled_ids:
                    continue
                else:
                    self.crawled_ids.append(old_url_query['id'])
            old_url_query['hl'] = 'en'
            old_url_query['gl'] = 'vn'
            link.url = '{}://{}{}?{}'.format(old_url_obj.scheme, old_url_obj.netloc, old_url_obj.path,
                                             urlencode(old_url_query))
            new_links.append(link)
        # print("LINKKSSS ====", links)
        # print("NEW_LINKKSSS ====", new_links)
        # print("-------------- PRINTING SECTION PROCESS_LINKS --------------")
        return new_links

    def parse_1(self, response):
        selector = scrapy.Selector(response)
        urls = selector.xpath('//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href').extract()
        links = []
        for url in urls:
            if not url.startswith('https://play.google.com/'):
                url = "https://play.google.com" + url
            links.append(url)

        link_flag = 0
        for url in urls:
            # yield links_list.append(scrapy.Request(url, callback=self.parse_next, dont_filter=True))
            yield Request(links[link_flag], callback=self.parse_next, dont_filter=True)
            link_flag += 1

    def parse_next(self, response):
        # print("PARSE_NEXT ===========", response.request.url)
        selector = scrapy.Selector(response)
        app_urls = selector.xpath('//div[@class="details"]/a[@class="title"]/@href').extract()
        urls = []
        for url in app_urls:
            url = "https://play.google.com" + url + '&hl=en&gl=vn'
            urls.append(url)

        url_list = []
        link_flag = 0
        for url in app_urls:
            yield Request(urls[link_flag], callback=self.parse_detail, dont_filter=True)
            link_flag += 1
        # return url_list

    def parse_detail(self, response):
        print("Parsed ======= ", response.request.url)
        item = dict()
        item['name'] = response.xpath('//div[@itemscope]//meta[@itemprop="name"]/@content').extract_first()
        item['category'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="applicationCategory"]/@content').extract_first()
        item['review_score'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="ratingValue"]/@content').extract_first()
        item['review_count'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="reviewCount"]/@content').extract_first()
        item['link'] = response.request.url
        item['id'] = dict(parse_qsl(urlparse(response.request.url).query))['id']
        item['content_rating'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="contentRating"]/@content').extract_first()
        item['image'] = response.xpath('//div[@itemscope]//meta[@itemprop="image"]/@content').extract_first()
        item['price'] = response.xpath('//div[@itemscope]//meta[@itemprop="price"]/@content').extract_first()
        item['price_currency'] = response.xpath(
            '//div[@itemscope]//meta[@itemprop="priceCurrency"]/@content').extract_first()
        # item['operating_system'] = response.xpath('//div[@itemscope]//meta[@itemprop="operatingSystem"]/@content').extract_first()
        return item
When I run it in the terminal, it says that it crawled 100 pages but scraped only 15 pages (the numbers are estimates).
Please help.
I am new to Scrapy. I am trying to download files using the media pipeline, but when I run the spider no files are stored in the folder.
spider:
import scrapy
from scrapy import Request

from pagalworld.items import PagalworldItem


class JobsSpider(scrapy.Spider):
    name = "songs"
    allowed_domains = ["pagalworld.me"]
    start_urls = ['https://pagalworld.me/category/11598/Latest%20Bollywood%20Hindi%20Mp3%20Songs%20-%202017.html']

    def parse(self, response):
        urls = response.xpath('//div[@class="pageLinkList"]/ul/li/a/@href').extract()
        for link in urls:
            yield Request(link, callback=self.parse_page, )

    def parse_page(self, response):
        songName = response.xpath('//li/b/a/@href').extract()
        for song in songName:
            yield Request(song, callback=self.parsing_link)

    def parsing_link(self, response):
        item = PagalworldItem()
        item['file_urls'] = response.xpath('//div[@class="menu_row"]/a[@class="touch"]/@href').extract()
        yield {"download_link": item['file_urls']}
Item file:
import scrapy


class PagalworldItem(scrapy.Item):
    file_urls = scrapy.Field()
Settings File:
BOT_NAME = 'pagalworld'
SPIDER_MODULES = ['pagalworld.spiders']
NEWSPIDER_MODULE = 'pagalworld.spiders'
ROBOTSTXT_OBEY = True
CONCURRENT_REQUESTS = 5
DOWNLOAD_DELAY = 3
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1
}
FILES_STORE = '/tmp/media/'
The output looks like this:
def parsing_link(self, response):
    item = PagalworldItem()
    item['file_urls'] = response.xpath('//div[@class="menu_row"]/a[@class="touch"]/@href').extract()
    yield {"download_link": item['file_urls']}
You are yielding:
yield {"download_link": ['http://someurl.com']}
where for Scrapy's Media/File pipeline to work you need to yield an item that contains a file_urls field. So try this instead:
def parsing_link(self, response):
    item = PagalworldItem()
    item['file_urls'] = response.xpath('//div[@class="menu_row"]/a[@class="touch"]/@href').extract()
    yield item
class AljazeeraSpider(XMLFeedSpider):
    name = "aljazeera"
    allowed_domains = ["aljazeera.com"]
    start_urls = [
        'http://www.aljazeera.com/',
    ]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)  # The xPath selector
        titles = hxs.select('//div[contains(@class,"SkyScrapperBoxes")]/div[contains(@class,"skyscLines")]')
        if not titles:
            MailNotify().send_mail("Aljazeera", "Scraper Report")

        items = []
        for titles in titles:
            item = NewsItem()
            item['title'] = escape(''.join(titles.select('a/text()').extract()))
            item['link'] = "http://www.aljazeera.com" + escape(''.join(titles.select('a/@href').extract()))
            item['description'] = ''
            item = Request(item['link'], meta={'item': item}, callback=self.parse_detail)
            items.append(item)
        return items

    def parse_detail(self, response):
        item = response.meta['item']
        sel = HtmlXPathSelector(response)
        detail = sel.select('//td[@class = "DetailedSummary"]')
        item['details'] = remove_html_tags(escape(''.join(detail.select('p').extract())))
        item['location'] = ''
        published_date = sel.select('//span[@id = "ctl00_cphBody_lblDate"]')
        item['published_date'] = escape(''.join(published_date.select('text()').extract()))
        return item
I am currently working with Scrapy to crawl the website. I have some knowledge of unittest in Python, but how can I write a unit test to check that the link is working and that item['location'] and item['details'] return values? I have looked at Scrapy contracts but cannot understand them, so how can I write the unit tests in this case?
If we are talking specifically about how to test the spiders (not pipelines or loaders), then what we did was provide a "fake response" from a local HTML file. Sample code:
import os

from scrapy.http import Request, TextResponse


def fake_response(file_name=None, url=None):
    """Create a Scrapy fake HTTP response from a HTML file"""
    if not url:
        url = 'http://www.example.com'

    request = Request(url=url)
    if file_name:
        if not file_name[0] == '/':
            responses_dir = os.path.dirname(os.path.realpath(__file__))
            file_path = os.path.join(responses_dir, file_name)
        else:
            file_path = file_name
        file_content = open(file_path, 'r').read()
    else:
        file_content = ''

    response = TextResponse(url=url, request=request, body=file_content,
                            encoding='utf-8')
    return response
Then, in your TestCase class, call the fake_response() function and feed the response to the parse() callback:
from unittest.case import TestCase


class MyTestCase(TestCase):
    def setUp(self):
        self.spider = MySpider()

    def test_parse(self):
        response = fake_response('input.html')
        item = self.spider.parse(response)
        self.assertEqual(item['title'], 'My Title')
        # ...
Aside from that, you should definitely start using Item Loaders with input and output processors; this would help achieve better modularity and, hence, isolation: the spider would just yield item instances, while data preparation and modification would be encapsulated inside the loader, which you would test separately.
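For illustration, a minimal Item Loader sketch along those lines; the processors and the XPath here are assumptions for the example, not taken from the spider above:

from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst  # itemloaders.processors in newer Scrapy


class NewsItemLoader(ItemLoader):
    # strip whitespace on the way in, keep a single value on the way out
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()


# inside the spider callback (NewsItem comes from the question's items module):
def parse(self, response):
    loader = NewsItemLoader(item=NewsItem(), response=response)
    loader.add_xpath('title', '//div[contains(@class, "SkyScrapperBoxes")]//a/text()')
    loader.add_value('description', '')
    yield loader.load_item()

This way the test case above can assert against already-cleaned values, and the processors can be unit-tested on their own, separately from the spider.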