How can I directly fetch the data from URL to Azure Blob - python

I'm using requests.get, but it downloads the file to the machine where I run the script. Is there a way to skip saving it locally?
r = requests.get(url, stream=True)
file_name = url.split("/")[-1]
with open(file_name, 'wb') as data:
    for chunk in r.iter_content(chunk_size=1024*1024):
        if chunk:
            data.write(chunk)
block_blob_service.create_blob_from_path(path.join(container, blob),
                                         data.name,
                                         file_name,
                                         content_settings=ContentSettings(content_type=mimetypes.guess_type('./%s' % url.split("/")[-1])[0]))

Try the code below, which uploads the response body directly without writing a local file:
import io
import requests

r = requests.get(url, stream=True)
block_blob_service.create_blob_from_stream(container_name, blob_name, io.BytesIO(r.content))
Or:
r = requests.get(url, stream=True)
block_blob_service.create_blob_from_bytes(container_name, blob_name, r.content)
Hope it helps.
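Note that both variants read the whole response into memory via r.content. If the files are large, one option is to hand the raw response stream to the SDK instead. A minimal sketch, assuming the legacy azure-storage-blob SDK with a configured BlockBlobService (the account name and key below are placeholders):
import requests
from azure.storage.blob import BlockBlobService

block_blob_service = BlockBlobService(account_name='<account name>', account_key='<account key>')

r = requests.get(url, stream=True)
r.raw.decode_content = True  # let urllib3 transparently undo gzip/deflate encoding
# create_blob_from_stream accepts any file-like object; max_connections=1 keeps the
# upload sequential, which is needed because r.raw is not seekable.
block_blob_service.create_blob_from_stream(container_name, blob_name, r.raw, max_connections=1)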

Related

Name different files from another file having different names respectively

I have created a script to download multiple images. I have another file (linksVars.py) containing the URLs of the images to download. The script imports linksVars.py, reads one URL at a time, downloads the image from that URL, and writes it into a file named {file_name}.jpg
Below is the relevant part of the code:
import linksVars as lV  # file with URLs

def download_url(url):
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("=") + 1  # name the image using text from the URL
    name_from_url = url[file_name_start_pos:]
    file_name = name_from_url
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        # Open the image file and write the data into it
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r:
                f.write(data)
Now, I have multiple names written in name_file.txt (an external file). As each image is downloaded, I want file_name in {file_name}.jpg to be taken from name_file.txt, and as the code moves on to the next file, the next name in name_file.txt should be assigned to {file_name}.jpg. If someone could help me, I would be grateful!
Below is the complete code:
import requests
import linksVars as lV

def download_url(url):
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("=") + 1
    name_from_url = url[file_name_start_pos:]
    file_name = name_from_url
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r:
                f.write(data)

links = lV.List1
try:
    for listLinks in links:
        download_url(listLinks)
except KeyboardInterrupt:
    print("\n\n===> Script ended by USER! <===")
Try this:
import requests
import linksVars as lV  # file with the URLs stored in a list
import nameVars as nV   # file with the names stored in a list

links = lV.List1  # List1 is the list of URLs
names = nV.Name1  # Name1 is the list of names

# This function downloads an image from a URL and names it from Name1
def download_url(url, names):
    print(f"\nDownloading from: ", url)
    file_name = names  # the file name now comes from the names list, not the URL
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:  # the downloaded file is opened and named
            for data in r:
                f.write(data)

try:
    for listLinks, listNames in zip(links, names):  # iterate over both lists in parallel
        download_url(listLinks, listNames)
except KeyboardInterrupt:
    print("\n\n===> Script ended by USER! <===")
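If the names live in a plain name_file.txt rather than a Python module, a minimal variant along the same lines (assuming one name per line, in the same order as the URLs in linksVars.List1) would be:
import requests
import linksVars as lV

# Read one name per line from the external text file (assumed layout).
with open('name_file.txt') as nf:
    names = [line.strip() for line in nf if line.strip()]

def download_url(url, file_name):
    print(f"\nDownloading from: ", url)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r.iter_content(chunk_size=8192):
                f.write(data)

# Pair each URL with the name at the same position.
for link, name in zip(lV.List1, names):
    download_url(link, name)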

Memory optimization while downloading and saving data to s3 using python

filename ="test.zip"
url = "some url"
data= requests.get(url, stream= True)
f= BytesIO()
f.write(data.content)
try:
s3_r = boto3.resource('s3')
s3_r.Object(bucket,filename).put(body= f.getvalue())
return filename
except:
print('failed')
Note: the zip file is larger than 1 GB.
My code works fine, but it uses a lot of memory; I need to optimize it so that memory usage stays below 200 MB.
I found the code below for downloading in chunks:
def download_file(url):
    local_filename = url.split('/')[-1]
    # NOTE the stream=True parameter below
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                # If you have a chunk-encoded response, uncomment the `if`
                # and set chunk_size to None.
                # if chunk:
                f.write(chunk)
    return local_filename
but I'm not able to save the chunks written by f.write(chunk) to the S3 bucket.
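One way to keep memory flat is to hand the streaming response straight to boto3, which performs a multipart upload behind the scenes instead of buffering the whole body. A minimal sketch, assuming the bucket name and URL are placeholders:
import boto3
import requests

def stream_to_s3(url, bucket, key):
    s3 = boto3.client('s3')
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        r.raw.decode_content = True  # transparently handle gzip/deflate encoding
        # upload_fileobj reads the file-like object in parts (multipart upload),
        # so the full 1 GB body never sits in memory at once.
        s3.upload_fileobj(r.raw, bucket, key)
    return key

# stream_to_s3("some url", "my-bucket", "test.zip")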

Scraping files from google drive - automated queries prevented by

I wanted to scrape a few PDFs from a great history crash course I used to read a long time ago. Sadly, the old website is down and I only managed to get the old HTML code from archive.org
(the links I got work fine, ex: https://drive.google.com/file/d/0BzRJiIvdbSoKcHpGUWJBUDZ2WDA/edit?usp=sharing).
This script results in HTML files being downloaded that say:
"We're sorry but your computer or network may be sending automated queries. To protect our users, we can't process your request right now."
Is there a way to bypass this? I tried putting a few random delays into the code, so either that is insufficient or I might be on Google's blacklist for now.
(the text.txt file can be found here https://filebin.net/k2qw09embamx05ey )
import requests
import time
import random

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"
    session = requests.Session()
    response = session.get(URL, params={'id': id}, stream=True)
    token = get_confirm_token(response)
    time.sleep(random.randrange(1, 2))
    if token:
        params = {'id': id, 'confirm': token}
        response = session.get(URL, params=params, stream=True)
    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)

f = open('text.txt')
long_string = f.readlines()

interesting_strings = []
for item in long_string:
    if 'drive.google' in item:
        interesting_strings.append(item)
print(interesting_strings)

interesting_strings = interesting_strings[0]
interesting_strings = interesting_strings.split('https://web.archive.org/web/20161219093036/')

links = []
for item in interesting_strings:
    if 'drive.google' in item:
        idx = item.find('"')
        links.append(item[:idx])

cntr = 1
for link in links:
    print(link)
    fname = './data/History_' + str(cntr)
    file_id = link.split('/')[-2]
    print('id:', file_id)
    destination = fname
    download_file_from_google_drive(file_id, destination)
    print('Getting file #', str(cntr))
    cntr += 1
    time.sleep(random.randrange(3, 15) + random.random())
Use gdown:
import gdown
file_id = '0BzRJiIvdbSoKcHpGUWJBUDZ2WDA'
filename = 'file.pdf'
url = 'https://drive.google.com/uc?id=' + file_id
gdown.download(url, filename, quiet=False)
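To fetch all of the parsed links this way, a simple loop should do. A sketch, assuming `links` holds the Google Drive URLs extracted from text.txt as in the question, and that the file ID sits in the same URL position:
import gdown

cntr = 1
for link in links:
    file_id = link.split('/')[-2]  # same ID extraction as in the question
    url = 'https://drive.google.com/uc?id=' + file_id
    destination = './data/History_' + str(cntr) + '.pdf'
    gdown.download(url, destination, quiet=False)
    cntr += 1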

Download multiple files from S3 django

Here is the link I used (Download files from Amazon S3 with Django). Using this, I'm able to download a single file.
Code:
s3_template_path = queryset.values('file')
filename = 'test.pdf'
conn = boto.connect_s3('<aws access key>', '<aws secret key>')
bucket = conn.get_bucket('your_bucket')
s3_file_path = bucket.get_key(s3_template_path)
response_headers = {
    'response-content-type': 'application/force-download',
    'response-content-disposition': 'attachment;filename="%s"' % filename
}
url = s3_file_path.generate_url(60, 'GET',
                                response_headers=response_headers,
                                force_http=True)
return HttpResponseRedirect(url)
I need to download multiple files from S3, ideally as a zip. Can the method above be modified for that? If not, please suggest another approach.
Okay, here is a possible solution: it downloads each file, zips them into a folder, and returns the zip to the user.
I'm not sure whether s3_template_path is the same for each file, so change that if necessary.
# python 3
import io
import os
import zipfile

import boto
import requests
from django.http import HttpResponse

file_names = ['test.pdf', 'test2.pdf', 'test3.pdf']

# set up the zip folder
zip_subdir = "download_folder"
zip_filename = zip_subdir + ".zip"
byte_stream = io.BytesIO()
zf = zipfile.ZipFile(byte_stream, "w")

for filename in file_names:
    s3_template_path = queryset.values('file')
    conn = boto.connect_s3('<aws access key>', '<aws secret key>')
    bucket = conn.get_bucket('your_bucket')
    s3_file_path = bucket.get_key(s3_template_path)
    response_headers = {
        'response-content-type': 'application/force-download',
        'response-content-disposition': 'attachment;filename="%s"' % filename
    }
    url = s3_file_path.generate_url(60, 'GET',
                                    response_headers=response_headers,
                                    force_http=True)

    # download the file
    file_response = requests.get(url)
    if file_response.status_code == 200:
        # create a local copy of the file
        f1 = open(filename, 'wb')
        f1.write(file_response.content)
        f1.close()

        # write the file into the zip folder
        fdir, fname = os.path.split(filename)
        zip_path = os.path.join(zip_subdir, fname)
        zf.write(filename, zip_path)

# close the zip folder and return it
zf.close()
response = HttpResponse(byte_stream.getvalue(), content_type="application/x-zip-compressed")
response['Content-Disposition'] = 'attachment; filename=%s' % zip_filename
return response
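If you would rather not leave copies of the downloaded files on the server's disk, the bytes can be written into the archive directly with ZipFile.writestr. A sketch of just the inner loop body, under the same assumptions as above:
# inside the `for filename in file_names:` loop, replacing the open/write/close block
file_response = requests.get(url)
if file_response.status_code == 200:
    # write the downloaded bytes straight into the in-memory zip,
    # without creating a temporary file on disk
    zip_path = os.path.join(zip_subdir, os.path.basename(filename))
    zf.writestr(zip_path, file_response.content)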

How to use `tqdm` in python to show progress when downloading data online?

I can find documentation explaining how to use the tqdm package, but I can't figure out from it how to produce a progress meter when downloading data online.
Below is example code I copied from ResidentMario for downloading data:
import requests

def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`.
    Returns a pointer to `filename`.
    """
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    return filename

dat = download_file("https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD",
                    "NYPD Motor Vehicle Collisions.csv")
Could anyone show me how to use the tqdm package here to show the download progress?
Thanks
As of now, I do something like this:
import requests
from tqdm import tqdm

def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`.
    Returns a pointer to `filename`.
    """
    chunkSize = 1024
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        pbar = tqdm(unit="B", total=int(r.headers['Content-Length']))
        for chunk in r.iter_content(chunk_size=chunkSize):
            if chunk:  # filter out keep-alive new chunks
                pbar.update(len(chunk))
                f.write(chunk)
    return filename
Don't forget pbar.clear() and pbar.close(). As the tqdm docs put it, you manually update the progress bar, which is useful for streams such as reading files (https://github.com/tqdm/tqdm#returns).
def download_file(url, filename):
    """
    Helper method handling downloading large files from `url` to `filename`.
    Returns a pointer to `filename`.
    """
    r = requests.get(url, stream=True)
    with open(filename, 'wb') as f:
        pbar = tqdm(unit="B", unit_scale=True, unit_divisor=1024,
                    total=int(r.headers['Content-Length']))
        pbar.clear()  # clear 0% info
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                pbar.update(len(chunk))
                f.write(chunk)
        pbar.close()
    return filename
Thanks to silmaril, but the version below works and makes more sense to me.
def download_file(url, filename):
    r = requests.get(url, stream=True)
    filelength = int(r.headers['Content-Length'])
    with open(filename, 'wb') as f:
        pbar = tqdm(total=int(filelength / 1024))
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                pbar.update()
                f.write(chunk)
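For what it's worth, recent tqdm releases also ship tqdm.wrapattr, which wraps the file's write method so the bar advances by itself. A minimal sketch of the same download using it, assuming the server sends a Content-Length header:
import requests
from tqdm import tqdm

def download_file(url, filename):
    r = requests.get(url, stream=True)
    total = int(r.headers.get('Content-Length', 0))
    # wrapattr intercepts every f.write() call and advances the bar by the
    # number of bytes written, so no manual pbar.update() is needed.
    with tqdm.wrapattr(open(filename, 'wb'), 'write',
                       total=total, unit='B', unit_scale=True, unit_divisor=1024) as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
    return filename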
