scrapy - download with correct extension - python

I have the following spider:
import os

import pandas as pd
import scrapy

class Downloader(scrapy.Spider):
    name = "sor_spider"
    download_folder = FOLDER

    def get_links(self):
        # Read the list of URLs from the Excel file
        df = pd.read_excel(LIST)
        return df["Value"]

    def start_requests(self):
        urls = self.get_links()
        for index, url in urls.items():
            yield scrapy.Request(url=url, callback=self.download_file,
                                 errback=self.errback_httpbin,
                                 meta={"index": index}, dont_filter=True)

    def download_file(self, response):
        url = response.url
        index = response.meta["index"]
        content_type = response.headers['Content-Type']
        download_path = os.path.join(self.download_folder, str(index))
        with open(download_path, "wb") as f:
            f.write(response.body)
        yield LinkCheckerItem(index=response.meta["index"], url=url, code="downloaded")

    def errback_httpbin(self, failure):
        yield LinkCheckerItem(index=failure.request.meta["index"],
                              url=failure.request.url, code="error")
It should:
read the Excel file with links (LIST)
go to each link and download the file to FOLDER
log the results in LinkCheckerItem (I am exporting it to CSV)
That would normally work fine, but my list contains files of different types: zip, pdf, doc, etc.
These are the examples of links in my LIST:
https://disclosure.1prime.ru/Portal/GetDocument.aspx?emId=7805019624&docId=2c5fb68702294531afd03041e877ca84
http://www.e-disclosure.ru/portal/FileLoad.ashx?Fileid=1173293
http://www.e-disclosure.ru/portal/FileLoad.ashx?Fileid=1263289
https://disclosure.1prime.ru/Portal/GetDocument.aspx?emId=7805019624&docId=eb9f06d2b837401eba9c66c8bf5be813
http://e-disclosure.ru/portal/FileLoad.ashx?Fileid=952317
http://e-disclosure.ru/portal/FileLoad.ashx?Fileid=1042224
https://www.e-disclosure.ru/portal/FileLoad.ashx?Fileid=1160005
https://www.e-disclosure.ru/portal/FileLoad.ashx?Fileid=925955
https://www.e-disclosure.ru/portal/FileLoad.ashx?Fileid=1166563
http://npoimpuls.ru/templates/npoimpuls/material/documents/%D0%A1%D0%BF%D0%B8%D1%81%D0%BE%D0%BA%20%D0%B0%D1%84%D1%84%D0%B8%D0%BB%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%BD%D1%8B%D1%85%20%D0%BB%D0%B8%D1%86%20%D0%BD%D0%B0%2030.06.2016.pdf
http://нпоимпульс.рф/assets/download/sal30.09.2017.pdf
http://www.e-disclosure.ru/portal/FileLoad.ashx?Fileid=1166287
I would like it to save each file with its original extension, whatever it is, just like my browser does when it opens a dialog to save the file.
I tried to use response.headers["Content-Type"] to find out the type, but in this case it is always application/octet-stream.
How could I do it?

You need to parse the Content-Disposition header to get the correct file name.
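Here is a minimal sketch of the question's download_file adjusted to do that, assuming the server sends a header like attachment; filename="report.pdf". The regex and the fallback to the URL path are my assumptions, not a full parser for every Content-Disposition variant:
import os
import re
from urllib.parse import urlparse, unquote

def download_file(self, response):
    index = response.meta["index"]
    disposition = response.headers.get("Content-Disposition", b"").decode("utf-8", errors="ignore")
    match = re.search(r'filename="?([^";]+)"?', disposition)
    if match:
        # Take the extension from the server-supplied file name, e.g. ".pdf"
        extension = os.path.splitext(match.group(1))[1]
    else:
        # Fall back to whatever extension the URL path carries, if any
        extension = os.path.splitext(unquote(urlparse(response.url).path))[1]
    download_path = os.path.join(self.download_folder, "{}{}".format(index, extension))
    with open(download_path, "wb") as f:
        f.write(response.body)
    yield LinkCheckerItem(index=index, url=response.url, code="downloaded")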

Related

Databricks web scraping

I wrote some Python code to automatically download a CSV file from the internet. The code works when it runs on my local computer, but not when I run it on Databricks. The problem is that I don't know how to save it to my DBFS folder: folder = "/dbfs/mnt/fmi-import/DNB Scenariosets/". The code does execute, but the file is nowhere to be found.
import requests
from bs4 import BeautifulSoup
import re

url_scenariosets_dnb = 'https://www.dnb.nl/voor-de-sector/open-boek-toezicht-sectoren/pensioenfondsen/haalbaarheidstoets/uitvoering-en-normen/scenarioset-haalbaarheidstoets-pensioenfondsen/'
folder = "/dbfs/mnt/fmi-import/DNB Scenariosets/"

class dataset_downloader:
    def __init__(self, url):
        self.url = url

    def scrape(self):
        reqs = requests.get(self.url, verify=False)
        soup = BeautifulSoup(reqs.text, 'html.parser')
        self.urls = []
        for link in soup.find_all('a'):
            self.urls.append(link.get('href'))
        return self.urls

    def filter_scenarioset(self):
        # Search data based on regular expression in the list
        self.scenarioset_links = []
        [self.scenarioset_links.append('https://www.dnb.nl' + val) for val in self.urls
         if re.search(r'hbt-scenarioset-10k', val)]
        return self.scenarioset_links

    def download_file(self, year, quarter):
        try:
            self.downloadlink = []
            [self.downloadlink.append(val) for val in self.scenarioset_links
             if re.search(r'hbt-scenarioset-10k-{}q{}'.format(year, quarter), val)]
            filename = 'hbt-scenarioset-10k-{}q{}.xlsx'.format(year, quarter)
            with requests.get(self.downloadlink[0]) as req:
                with open(filename, 'wb') as f:
                    for chunk in req.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return "/dbfs/mnt/fmi-import/DNB Scenariosets/" + filename
        except Exception as e:
            print(e)
            return None

#%% EXECUTE
download = dataset_downloader(url_scenariosets_dnb)
download.scrape()
download.filter_scenarioset()
download.download_file(2020, 2)  # select year and quarter
Do you have any suggestions on how to download a CSV file with Databricks and save it to a DBFS folder? Thank you in advance!
Vincent
The problem is that in your code you have only the filename, without the path to the folder. It should be:
with open(folder + filename, 'wb')
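For instance, the write block inside download_file would then look like this (a sketch; everything apart from the open() call is unchanged from the question):
with requests.get(self.downloadlink[0]) as req:
    # Prepend the DBFS mount path so the file lands in the mounted folder
    with open(folder + filename, 'wb') as f:
        for chunk in req.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
return folder + filename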

Why does PyCharm give a warning about this Scrapy function's signature?

import scrapy

class WikiSpider(scrapy.Spider):
    name = "wiki_spider"

    def start_requests(self):
        urls = [
            'https://zh.wikipedia.org/wiki/Category:愤怒的小鸟系列电子游戏',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = f'wiki-{page}.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        self.log(f'Saved file {filename}')
My code above is copied from this tutorial:
https://docs.scrapy.org/en/latest/intro/tutorial.html
But why does PyCharm give a warning about the parse function's signature?
Signature of method 'WikiSpider.parse()' does not match signature of base method in class 'Spider'
Is this a false alarm?
Because the parse method of the base Spider class also has another parameter, **kwargs. I don't think this causes any errors while running the program, but adding **kwargs does remove the warning.
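A minimal sketch of the adjusted signature; the body is unchanged from the question:
def parse(self, response, **kwargs):
    page = response.url.split("/")[-2]
    filename = f'wiki-{page}.html'
    with open(filename, 'wb') as f:
        f.write(response.body)
    self.log(f'Saved file {filename}')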

Change twitter banner from url

How would I go about changing the Twitter banner using an image from a URL with the tweepy library: https://github.com/tweepy/tweepy/blob/v2.3.0/tweepy/api.py#L392
So far I have this, and it returns:
def banner(self):
    url = 'https://blog.snappa.com/wp-content/uploads/2019/01/Twitter-Header-Size.png'
    file = requests.get(url)
    self.api.update_profile_banner(filename=file.content)
ValueError: stat: embedded null character in path
It seems like filename requires an image that has been downloaded. Is there any way to do this without downloading the image and then removing it?
Looking at the library's code, you can do what you want:
def update_profile_banner(self, filename, *args, **kargs):
    f = kargs.pop('file', None)
So what you need to do is supply the filename and the file kwarg:
filename = url.split('/')[-1]
self.api.update_profile_banner(filename, file=file.content)
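One caveat, as an assumption on my part: since the library pops a file object here, it presumably reads from it, so passing raw bytes as file may not work. Wrapping the downloaded content in io.BytesIO gives it an in-memory file without touching disk. A sketch based on the question's banner method:
import io

import requests

def banner(self):
    url = 'https://blog.snappa.com/wp-content/uploads/2019/01/Twitter-Header-Size.png'
    file = requests.get(url)
    filename = url.split('/')[-1]
    # BytesIO behaves like an open binary file, so no temp file is needed
    self.api.update_profile_banner(filename, file=io.BytesIO(file.content))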
import tempfile

import requests

def banner(self):
    url = 'file_url'
    file = requests.get(url)
    temp = tempfile.NamedTemporaryFile(suffix=".png")
    try:
        temp.write(file.content)
        temp.flush()  # make sure the bytes are on disk before tweepy opens the file by name
        self.api.update_profile_banner(filename=temp.name)
    finally:
        temp.close()

How to return multiple files in HttpResponse Django

I have been racking my brain over this problem. Is there a way in Django to serve multiple files from a single HttpResponse?
I have a scenario where I am looping through a list of JSON objects and want to return all of them as files from my admin view.
class CompanyAdmin(admin.ModelAdmin):
    form = CompanyAdminForm
    actions = ['export_company_setup']

    def export_company_setup(self, request, queryset):
        update_count = 0
        error_count = 0
        company_json_list = []
        response_file_list = []
        for pb in queryset.all():
            try:
                # get_company_json_data takes an id and returns json for the company.
                company_json_list.append(get_company_json_data(pb.pk))
                update_count += 1
            except:
                error_count += 1
        # TODO: Get multiple json files from here.
        for company in company_json_list:
            response = HttpResponse(json.dumps(company), content_type="application/json")
            response['Content-Disposition'] = 'attachment; filename=%s.json' % company['name']
            return response
        # self.message_user(request, "%s company setup extracted and %s company setup extraction failed" % (update_count, error_count))
        # return response
Now this will only let me return/download one JSON file, as return breaks the loop. Is there a simpler way to append all of this to a single response object, return it outside the loop, and download all the JSON in the list as multiple files?
I looked into wrapping all these files into a zip file, but I failed to do so, as all the examples I could find had files with a path and name, which I don't really have in this case.
UPDATE:
I tried to integrate zartch's solution to get a zip file using the following:
import StringIO, zipfile

outfile = StringIO.StringIO()
with zipfile.ZipFile(outfile, 'w') as zf:
    for company in company_json_list:
        zf.writestr("{}.json".format(company['name']), json.dumps(company))
    response = HttpResponse(outfile.getvalue(), content_type="application/octet-stream")
    response['Content-Disposition'] = 'attachment; filename=%s.zip' % 'company_list'
    return response
Since I never had the files to begin with, I thought about just using the JSON dumps I had and adding individual filenames. This just creates an empty zip file, which I think is expected, as I am sure zf.writestr("{}.json".format(company['name']), json.dumps(company)) is not the way to do it. I would appreciate it if anyone could help me with this.
Maybe if you pack all the files into one zip you can achieve this in Admin.
Something like:
def zipFiles(files):
    outfile = StringIO()  # io.BytesIO() for python 3
    with zipfile.ZipFile(outfile, 'w') as zf:
        for n, f in enumerate(files):
            zf.writestr("{}.csv".format(n), f.getvalue())
    return outfile.getvalue()

zipped_file = zipFiles(myfiles)
response = HttpResponse(zipped_file, content_type='application/octet-stream')
response['Content-Disposition'] = 'attachment; filename=my_file.zip'
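For the asker's JSON case, the key detail is to call getvalue() only after the ZipFile context manager has closed the archive; otherwise the central directory is never written and the zip comes out empty, which matches the symptom in the update. A Python 3 sketch, assuming company_json_list as in the question (export_companies_as_zip is a hypothetical helper name):
import io
import json
import zipfile

from django.http import HttpResponse

def export_companies_as_zip(company_json_list):
    outfile = io.BytesIO()
    with zipfile.ZipFile(outfile, 'w') as zf:
        for company in company_json_list:
            zf.writestr("{}.json".format(company['name']), json.dumps(company))
    # getvalue() after the with block, so the archive is finalized
    response = HttpResponse(outfile.getvalue(), content_type='application/zip')
    response['Content-Disposition'] = 'attachment; filename=company_list.zip'
    return response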
I have met this requirement before. My solution is to use HTML href attributes and JavaScript.
Use the server to generate the list of files to download:
<a href="http://a.json" download='a.json'></a>
<a href="http://b.json" download='b.json'></a>
<a href="http://c.json" download='c.json'></a>
<a href="http://d.json" download='d.json'></a>
<a href="http://e.json" download='e.json'></a>
<script>
// simulate clicks to trigger the download action
document.querySelectorAll('a').forEach( aTag => aTag.click());
</script>

Taking screenshots from multiple urls in a file

I'm trying to take screenshots of pages whose URLs are in a .txt file. When I try just a single URL such as "http://testing.com" it works, but when I assign it to a variable instead of using the string directly, it doesn't work. Here's the code:
def capture(self, url, output_file):
    self.load(QUrl(url))
    self.wait_load()

file_list = open("LiveSite.txt")
for site in file_list.readlines():
    time.sleep(5)
    s.capture(site, site + ".png")
The site variable will contain the newline character for every line in the file. You can try:
site = site.strip()
before calling s.capture:
def capture(self, url, output_file):
    self.load(QUrl(url))
    self.wait_load()

file_list = open("LiveSite.txt")
for site in file_list.readlines():
    time.sleep(5)
    site = site.strip()
    s.capture(site, site + ".png")
