I have created a script to download multiple images. I have another file (linkVars.py) in which there are URLs of the images to download. This script import the linkVars.py file then reads one URL at a time, downloads that image from the URL, and writes it into a file named {file_name}.jpg
Below is the code for the explanation of upper lines:
import linksVars as lV # file with urls
def download_url(url):
# Creating a function
print(f"\nDownloading from: ", url)
file_name_start_pos = url.rfind("=") + 1 # naming image by using text in url
name_from_url = url[file_name_start_pos:]
file_name = name_from_url
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
# Opening the image file to write data in it
with open(f'{file_name}.jpg', 'wb') as f:
for data in r:
f.write(data)
Now, I have multiple names written in name_file.txt(external file). As I download the image, I want to name file_name in {file_name}.jpg from one name in name_file.txt. Then as the code starts to download the next file, the next name in name_file.txt should be assigned to {file_name}.jpg If someone could help me then I will be grateful!
Below is the complete code:
import requests
import linksVars as lV
def download_url(url):
print(f"\nDownloading from: ", url)
file_name_start_pos = url.rfind("=") + 1
name_from_url = url[file_name_start_pos:]
file_name = name_from_url
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
with open(f'{file_name}.jpg', 'wb') as f:
for data in r:
f.write(data)
links = lV.List1
try:
for listLinks in links:
download_url(listLinks)
except(KeyboardInterrupt):
print("\n\n===> Script ended by USER! <===")
Try this:
import requests
import linksVars as lV # Importing file with URLs stored in variables
import nameVars as nV # Importing file with names stored in variables
links = lV.List1 # List1 is the list of URLs stored in variables
names = nV.Name1 # Name1 is the list of names stored in variables
# This function will download image from URL and name it from Name1
def download_url(url, names):
print(f"\nDownloading from: ", url)
file_name_start_pos = url.rfind("v=") + 1 # It will find "v=" in given URL and move to next line
name_from_url = url[file_name_start_pos:]
file_name = names
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
with open(f'{file_name}.jpg', 'wb') as f: # Downloaded file will opened and named
for data in r:
f.write(data)
try:
for listLinks, listNames in zip (links, names): # "For loop" will use two arguments
download_url(listLinks, listNames)
except(KeyboardInterrupt):
print("\n\n===> Script ended by USER! <===")
Related
I am trying to download files using python and then add lines at the end of the downloaded files, but it returns an error:
f.write(data + """<auth-user-pass>
TypeError: can't concat str to bytes
Edit: Thanks, it works now when I do this b"""< auth-user-pass >""", but I only want to add the string at the end of the file. When I run the code, it adds the string for every line.
I also tried something like this but it also did not work: f.write(str(data) + "< auth-user-pass >")
here is my full code:
import requests
from multiprocessing.pool import ThreadPool
def download_url(url):
print("downloading: ", url)
# assumes that the last segment after the / represents the file name
# if url is abc/xyz/file.txt, the file name will be file.txt
file_name_start_pos = url.rfind("/") + 1
file_name = url[file_name_start_pos:]
save_path = 'ovpns/'
complete_path = os.path.join(save_path, file_name)
print(complete_path)
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
with open(complete_path, 'wb') as f:
for data in r:
f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
return url
servers = [
"us-ca72.nordvpn.com",
"us-ca73.nordvpn.com"
]
urls = []
for server in servers:
urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")
# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
for r in results:
print(r)
EDIT: Thanks, it works now when I do this b"""< auth-user-pass >""", but I only want to add the string at the end of the file. When I run the code, it adds the string for every line.
Try this:
import requests
from multiprocessing.pool import ThreadPool
def download_url(url):
print("downloading: ", url)
# assumes that the last segment after the / represents the file name
# if url is abc/xyz/file.txt, the file name will be file.txt
file_name_start_pos = url.rfind("/") + 1
file_name = url[file_name_start_pos:]
save_path = 'ovpns/'
complete_path = os.path.join(save_path, file_name)
print(complete_path)
r = requests.get(url, stream=True)
if r.status_code == requests.codes.ok:
with open(complete_path, 'wb') as f:
for data in r:
f.write(data)
return url
servers = [
"us-ca72.nordvpn.com",
"us-ca73.nordvpn.com"
]
urls = []
for server in servers:
urls.append("https://downloads.nordcdn.com/configs/files/ovpn_legacy/servers/" + server + ".udp1194.ovpn")
# Run 5 multiple threads. Each call will take the next element in urls list
results = ThreadPool(5).imap_unordered(download_url, urls)
with open(complete_path, 'ab') as f:
f.write(b"""<auth-user-pass>
username
password
</auth-user-pass>""")
for r in results:
print(r)
You are using binary mode, encode your string before concat, that is replace
for data in r:
f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""")
using
for data in r:
f.write(data + """<auth-user-pass>
username
password
</auth-user-pass>""".encode())
You open the file as a write in binary.
Because of that you cant use normal strings like the comment from #user56700 said.
You either need to convert the string or open it another way(ex. 'a' = appending).
Im not completly sure but it is also possible that the write binary variant of open the data of the file deletes. Normally open with write deletes existing data, so its quite possible that you need to change it to 'rwb'.
I've been writing a code to download GRIB (weather) file of of the internet for future use. Right now, I'm only a the stage of downloading and writing in the right folder but for some reason when I ue TQDM for a progress bar, the file size almost doubles. Without the progress the file size is fine.
With the following code I get a 2.3MB file.
import datetime
fsearch = datetime.date.today().strftime('%Y%m%d00')
def sfc_pres():
id = fsearch
url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
r = requests.get(url, allow_redirects=True)
stat=r.status_code
while stat:
if stat==200:
print('Fichier trouvé, téléchargement')
elif stat==404:
print('Fichier introuvable')
break
id = fname
with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id) , 'wb') as f:
f.write(r.content)
If I use TQDM for a progress bar like so, I get a 4.5MB file.
import datetime
fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')
def sfc_pres():
id = fsearch
url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
r = requests.get(url, allow_redirects=True)
stat=r.status_code
while stat:
if stat==200:
print('Fichier trouvé, téléchargement')
elif stat==404:
print('Fichier introuvable')
break
from tqdm import tqdm
total_size_in_bytes= int(r.headers.get('content-length', 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
id = fname
with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id) , 'wb') as f:
f.write(r.content)
for data in r.iter_content(block_size):
progress_bar.update(len(data))
f.write(data)
progress_bar.close()
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
print("Échec du téléchargement")
My troubleshooting got me to know it was within the TQDM code but I can't find why...
If you're using r.iter_content you shouldn't also call f.write(r.content) - then you're writing the data twice (and lose the streaming behavior you're trying to get).
There is an algorithm in the end of the text. It reads lines from the file SP500.txt. File contains strings and it looks like:
AAA
BBB
CCC
Substitutes these strings in the get request and saves the entire url to a file url_requests.txt. For the example:
https://apidate.com/api/api/AAA.US?api_token=XXXXXXXX&period=d
https://apidate.com/api/api/BBB.US?api_token=XXXXXXXX&period=d
https://apidate.com/api/api/CCC.US?api_token=XXXXXXXX&period=d
and then processes each request via the API and adds all responses to get requests to responses.txt.
I don't know how to save the response from each request from the file url_requests.txt into separate csv file instead of responses.txt (now they are all written to this file, and not separately). In this case, it is important to name each file with the corresponding line from the file SP500.txt. For example:
AAA.csv `(which contains data from the request response https://apidate.com/api/api/AAA.US?api_token=XXXXXXXX&period=d)`
BBB.csv `(which contains data from the request response https://apidate.com/api/api/BBB.US?api_token=XXXXXXXX&period=d)`
CCC.csv `(which contains data from the request response https://apidate.com/api/api/CCC.US?api_token=XXXXXXXX&period=d)`
So, algorithm is:
import requests
# to use strip to remove spaces in textfiles.
import sys
# two variables to squeeze a string between these two so it will become a full uri
part1 = 'https://apidate.com/api/api/'
part2 = '.US?api_token=XXXXXXXX&period=d'
# open the outputfile before the for loop
text_file = open("url_requests.txt", "w")
# open the file which contains the strings
with open('SP500.txt', 'r') as f:
for i in f:
uri = part1 + i.strip(' \n\t') + part2
print(uri)
text_file.write(uri)
text_file.write("\n")
text_file.close()
# open a new file textfile for saving the responses from the api
text_file = open("responses.txt", "w")
# send every uri to the api and write the respones to a textfile
with open('url_requests.txt', 'r') as f2:
for i in f2:
uri = i.strip(' \n\t')
batch = requests.get(i)
data = batch.text
print(data)
text_file.write(data)
text_file.write('\n')
text_file.close()
And I know how to save csv from this response. It is like:
import csv
import requests
url = "https://apidate.com/api/api/AAA.US?api_token=XXXXXXXX&period=d"
response = requests.get(url)
with open('out.csv', 'w') as f:
writer = csv.writer(f)
for line in response.iter_lines():
writer.writerow(line.decode('utf-8').split(','))
To save in different names you have to use open() and write() inside for-loop when you read data.
It would good to read all names to list and later generate urls and also keep on list so you would not have to read them.
When I see code which you use to save csv then it looks like you get csv from server so you could save all at once using open() write() without csv module.
I see it in this way.
import requests
#import csv
# --- read names ---
all_names = [] # to keep all names in memory
with open('SP500.txt', 'r') as text_file:
for line in text_file:
line = line.strip()
print('name:', name)
all_names.append(line)
# ---- generate urls ---
url_template = 'https://apidate.com/api/api/{}.US?api_token=XXXXXXXX&period=d'
all_uls = [] # to keep all urls in memory
with open("url_requests.txt", "w") as text_file:
for name in all_names:
url = url_template.format(name)
print('url:', url)
all_uls.append(url)
text_file.write(url + "\n")
# --- read data ---
for name, url in zip(all_names, all_urls):
#print('name:', name)
#print('url:', url)
response = requests.get(url)
with open(name + '.csv', 'w') as text_file:
text_file.write(response.text)
#writer = csv.writer(text_file)
#for line in response.iter_lines():
# writer.writerow(line.decode('utf-8').split(',')
You could calculate a filename for every string i, and open (create) a file each time.
Something like this:
import sys
import requests
# two variables to squeeze a string between these two so it will become a full uri
part1 = 'https://apidate.com/api/api/'
part2 = '.US?api_token=XXXXXXXX&period=d'
# open the outputfile before the for loop
text_file = open("url_requests.txt", "w")
uri_dict = {}
with open('SP500.txt', 'r') as f:
for i in f:
uri = part1 + i.strip(' \n\t') + part2
print(uri)
text_file.write(uri)
text_file.write("\n")
uri_dict[i] = uri
text_file.close()
for symbol, uri in uri_dict:
batch = requests.get(uri)
data = batch.text
print(data)
#create the filename
filename = symbol+".csv"
#open (create) the file and save the data
with open(filename, "w") as f:
f.write(data)
f.write('\n')
You could also get rid of url_requests.csv, which becomes useless (until you have other uses for it).
I am trying to download few videos from list of links.
Every line in text file is one link.
When I try to download all videos in loop, only first one is working.
Videos are from 60 - 100 MB.
Loop continues afterwards, but files are empty.
Thank you for help.
def download():
name = 'video'
a = 1
with open('download.txt') as f:
lines = f.readlines()
for line in lines:
url = line
response = requests.get(url, stream=True)
name = name + str(a)
filename = name + '.mp4'
with open(filename, 'wb') as f:
f.write(response.content)
a = a + 1
def download():
name = 'video'
a = 1
with open('download.txt') as f:
lines = f.readlines()
for line in lines:
urllib.request.urlretrieve(line.strip(), name + str(a) + ".mpg")
a += 1
This code worked for me.
Depending on your purpose, you may want to account for security, robustness (what happens if one download fails?), performance (concurrency?).
I am trying access a page by incrementing the page counter using opencorporates api. But the problem is there are times when useless data is there. For example in the below url for jurisdiction_code = ae_az I get webpage showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How to check for such data and skip over this to move on to next jurisdiction?
This is my code
import urllib2
import json,os
f = open('codes','r')
for line in f.readlines():
id = line.strip('\n')
url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26¤t_status=Active&page={1}?api_token=ab123cd45'
i = 0
directory = id
os.makedirs(directory)
while True:
i += 1
req = urllib2.Request(url.format(id, i))
print url.format(id,i)
try:
response = urllib2.urlopen(url.format(id, i))
except urllib2.HTTPError, e:
break
content = response.read()
fo = str(i) + '.json'
OUTFILE = os.path.join(directory, fo)
with open(OUTFILE, 'w') as f:
f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
break
...
Here's your code written with Requests and using the answer here. It is nowhere near as robust or clean as it should be, but demonstrates the path you might want to take. The rate limit is a guess, and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep
import requests
url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20 # seconds to wait after rate limited
with open('codes') as f:
codes = [l.strip('\n') for l in f]
def get_page(code, page, **kwargs):
params = {
# 'api_token': token,
'jurisdiction_code': code,
'page': page,
}
params.update(kwargs)
while True:
r = requests.get(url, params=params)
try:
data = r.json()
except ValueError:
return None
if 'error' in data:
print data['error']['message']
sleep(rate)
continue
return data['results']
def dump_page(code, page, data):
with open(os.path.join(code, str(page) + '.json'), 'w') as f:
json.dump(data, f)
for code in codes:
try:
os.makedirs(code)
except os.error:
pass
data = get_page(code, 1)
if data is None:
continue
dump_page(code, 1, data['companies'])
for page in xrange(1, int(data.get('total_pages', 1))):
data = get_page(code, page)
if data is None:
break
dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
json_content = json.loads(content)
if json_content['results']['total_count'] > 0:
fo = str(i) + '.json'
OUTFILE = os.path.join(directory, fo)
with open(OUTFILE, 'w') as f:
f.write(content)
except KeyError:
break
except ValueError:
break
etc. You might want to report the ValueError or the KeyError, but that's up to you.