How to download multiple CSV files off a website using Python?

I am a beginner at programming (a finance professional) and I am looking to cut manual work using Python. I want to download multiple CSVs (the Daily Volatility CSVs for the past year) from https://www.nseindia.com/products/content/equities/equities/archieve_eq.htm
So far, I am able to download one file at a time, but I have not been able to write a for loop that downloads the past year's CSVs. It would also help if I could skip downloading the CSVs for Saturdays and Sundays.
I made a CSV file listing the links to all the required files, then tried to import that CSV and run a for loop over it, but I don't know enough programming to do that.
import requests
import shutil

r = requests.get('https://nseindia.com/archives/nsccl/volt/CMVOLT_01072018.CSV', stream=True)
if r.status_code == 200:
    with open("01072018.csv", 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
Desired result: download CSV files based on a date-range input.
Actual result: I can only download one CSV file at a time.

filenames = ['https://nseindia.com/archives/nsccl/volt/CMVOLT_01072018.CSV',
             'https://nseindia.com/archives/nsccl/volt/CMVOLT_01082018.CSV',
             'https://nseindia.com/archives/nsccl/volt/CMVOLT_01092018.CSV',
             ]

for x in filenames:
    r = requests.get(x, stream=True)
    if r.status_code == 200:
        with open(x.split('_')[-1], 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
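Since the question mentions a CSV file that already lists all the required links, one way to loop over it is to read that file with the csv module and reuse the download logic above. A minimal sketch, assuming a hypothetical file name links.csv with one URL per row:

import csv
import requests
import shutil

# 'links.csv' is a placeholder name; one URL per row is assumed
with open('links.csv', newline='') as links_file:
    for row in csv.reader(links_file):
        if not row:
            continue
        url = row[0]
        r = requests.get(url, stream=True)
        if r.status_code == 200:
            # name the local file after the date part of the URL
            with open(url.split('_')[-1].lower(), 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)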

Without adding another library, the following code should work, even though it didn't run on my machine (which has some network restrictions). It steps backwards one day at a time for 365 iterations and builds the file name from each date; dates for which the download fails, or which don't return a 200 (weekends, or invalid dates such as 30 February produced by the simple day arithmetic), are simply skipped.
import datetime as timer
import requests
import shutil

def download_data(date):
    url = 'https://nseindia.com/archives/nsccl/volt/CMVOLT_' + date + '.CSV'
    csv_filename = date + '.csv'
    try:
        print('Calling url:- ' + url)
        r = requests.get(url, stream=True, verify=False)
        if r.status_code == 200:
            with open(csv_filename, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        r.close()
    except Exception as e:
        print('for Date ' + date + ' Exception happened, most probably a weekend, EXCEPTION Message is ' + str(e))

def code_runner():
    i = 0
    now = timer.datetime.now()
    day = now.day
    month = now.month
    year = now.year
    while i < 365:
        day = day - 1
        if day == 0:
            day = 31
            month = month - 1
            if month == 0:
                month = 12
                year = year - 1
        year1 = year
        month1 = '{:02d}'.format(month)
        day1 = '{:02d}'.format(day)
        date = str(day1) + str(month1) + str(year1)
        download_data(date)
        i += 1

if __name__ == '__main__':
    code_runner()

I would add a date loop for your script:
#!/usr/bin/env ipython
# --------------------
import requests
import shutil
import datetime
# -----------------------------------------------------------------------------------
dates = [datetime.datetime(2019, 1, 1) + datetime.timedelta(dval) for dval in range(0, 366)]
# -----------------------------------------------------------------------------------
for dateval in dates:
    r = requests.get('https://www.nseindia.com/archives/nsccl/volt/CMVOLT_' + dateval.strftime('%d%m%Y') + '.CSV', stream=True)
    if r.status_code == 200:
        with open(dateval.strftime('%d%m%Y') + ".csv", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
# ---------------------------------------------------------------------------------
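Since the question also asks to skip Saturdays and Sundays, the dates list above can be filtered with datetime.weekday() (Monday is 0, Sunday is 6) before the download loop, for example:

# keep only weekdays (weekday() returns 5 for Saturday and 6 for Sunday)
dates = [dateval for dateval in dates if dateval.weekday() < 5]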

Related

Name different files from another file having different names respectively

I have created a script to download multiple images. The URLs of the images are stored in another file (linksVars.py). The script imports linksVars.py, reads one URL at a time, downloads the image from that URL, and writes it to a file named {file_name}.jpg.
Below is the code for the lines described above:
import requests
import linksVars as lV  # file with URLs

def download_url(url):
    # Function that downloads one image
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("=") + 1  # naming the image using text in the URL
    name_from_url = url[file_name_start_pos:]
    file_name = name_from_url
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        # Opening the image file to write data into it
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r:
                f.write(data)
Now, I have multiple names written in name_file.txt (an external file). As I download each image, I want file_name in {file_name}.jpg to come from one name in name_file.txt. Then, as the code starts to download the next file, the next name in name_file.txt should be assigned to {file_name}.jpg. If someone could help me, I would be grateful!
Below is the complete code:
import requests
import linksVars as lV

def download_url(url):
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("=") + 1
    name_from_url = url[file_name_start_pos:]
    file_name = name_from_url
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:
            for data in r:
                f.write(data)

links = lV.List1
try:
    for listLinks in links:
        download_url(listLinks)
except KeyboardInterrupt:
    print("\n\n===> Script ended by USER! <===")
Try this:
import requests
import linksVars as lV  # Importing file with URLs stored in variables
import nameVars as nV   # Importing file with names stored in variables

links = lV.List1  # List1 is the list of URLs
names = nV.Name1  # Name1 is the list of names

# This function downloads the image from the URL and names it from Name1
def download_url(url, names):
    print(f"\nDownloading from: ", url)
    file_name_start_pos = url.rfind("v=") + 1  # Finds "v=" in the given URL
    name_from_url = url[file_name_start_pos:]
    file_name = names
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(f'{file_name}.jpg', 'wb') as f:  # The downloaded file is opened and named
            for data in r:
                f.write(data)

try:
    for listLinks, listNames in zip(links, names):  # The for loop iterates over both lists in parallel
        download_url(listLinks, listNames)
except KeyboardInterrupt:
    print("\n\n===> Script ended by USER! <===")

bulk download using python with requests

I've been trying to download all the files on this page (https://apps.fs.usda.gov/fia/datamart/datamart_excel.html) in bulk, but am having some issues.
All the filenames are of the form '{state abbreviation}.xlsm', so I can download a single file using requests with code like this:
import requests

url = 'https://apps.fs.usda.gov/fia/datamart/Workbooks/WA.xlsm'
r = requests.get(url)
with open('WA.xlsm', 'wb') as f:
    f.write(r.content)
I believe there should be a way to incorporate this into a for loop to get all of the files, but I'm at a loss. Any advice?
Thanks!
Try the below
import requests

states = ['WA', 'CA']  # TODO add more states
for state in states:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    with open(f'{state}.xlsm', 'wb') as f:
        f.write(r.content)
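As a small variation on the snippet above (not part of the original answer), it can help to check the status code before writing, so a missing or renamed workbook doesn't get saved as an error page with an .xlsm name. A sketch:

import requests

states = ['WA', 'CA']  # same idea as above; extend with the states you need
for state in states:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    if r.status_code == 200:  # only save when the request succeeded
        with open(f'{state}.xlsm', 'wb') as f:
            f.write(r.content)
    else:
        print(f'Skipping {state}: HTTP {r.status_code}')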
Just to add on to @balderman's answer: if you have many states to fetch, it might be slightly more efficient to use a threading approach. A straightforward example using concurrent.futures:
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from time import time
import requests

states = ['WA', 'CA', 'VA', 'NC']  # TODO add more states
out_dir = Path('temp_files')
out_dir.mkdir(exist_ok=True)

def get_content(state: str) -> bytes:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    return r.content

start = time()
with ThreadPoolExecutor(max_workers=max(10, len(states))) as pool:
    for state, content in zip(states, pool.map(get_content, states)):
        with open(out_dir / f'{state}.xlsm', 'wb') as f:
            f.write(content)
print('Download ThreadExecutor took', time() - start)

# Compare times with below
# start = time()
# for state in states:
#     b = get_content(state)
#     with open(out_dir / f'{state}.xlsm', 'wb') as f:
#         f.write(b)
# print('Download took', time() - start)
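A note on the design: pool.map returns results in the same order as the input list, which is what makes the zip pairing above line up. If you would rather write each file as soon as its download finishes, a variant using as_completed could look like this (a sketch that reuses get_content, states and out_dir from the snippet above):

from concurrent.futures import ThreadPoolExecutor, as_completed

with ThreadPoolExecutor(max_workers=max(10, len(states))) as pool:
    # one future per state; files are written in completion order, not input order
    futures = {pool.submit(get_content, state): state for state in states}
    for future in as_completed(futures):
        state = futures[future]
        with open(out_dir / f'{state}.xlsm', 'wb') as f:
            f.write(future.result())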

Using TQDM almost doubles the file size of my GET request

I've been writing code to download GRIB (weather) files off the internet for later use. Right now I'm only at the stage of downloading and writing them to the right folder, but for some reason, when I use tqdm for a progress bar, the file size almost doubles. Without the progress bar the file size is fine.
With the following code I get a 2.3 MB file.
import datetime
import requests

fsearch = datetime.date.today().strftime('%Y%m%d00')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    while stat:
        if stat == 200:
            print('Fichier trouvé, téléchargement')
        elif stat == 404:
            print('Fichier introuvable')
        break
    id = fname  # fname is defined in the full script shown below
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
If I use TQDM for a progress bar like so, I get a 4.5MB file.
import datetime
import requests

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    while stat:
        if stat == 200:
            print('Fichier trouvé, téléchargement')
        elif stat == 404:
            print('Fichier introuvable')
        break
    from tqdm import tqdm
    total_size_in_bytes = int(r.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
        for data in r.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("Échec du téléchargement")
My troubleshooting tells me the problem is in the tqdm part of the code, but I can't see why...
If you're using r.iter_content, you shouldn't also call f.write(r.content): that writes the data twice (and you lose the streaming behaviour you're trying to get).
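To make that concrete, here is a minimal sketch of the download portion with the duplicate write removed (and stream=True added so the response is actually read in chunks); the URL, path and variable names follow the question:

import datetime
import requests
from tqdm import tqdm

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')
url = ('https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/'
       'CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2').format(fsearch)

r = requests.get(url, stream=True, allow_redirects=True)
total_size_in_bytes = int(r.headers.get('content-length', 0))
block_size = 1024
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(fname), 'wb') as f:
    for data in r.iter_content(block_size):  # one write per chunk, no f.write(r.content)
        progress_bar.update(len(data))
        f.write(data)
progress_bar.close()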

Get python code to persist after IndexError

I am querying an API from a website. The API goes down for maintenance from time to time, and there may also be no data available to query at times. I have written the code so that it keeps querying the API even after an error, but it doesn't seem to be working.
The following is the code:
import threading
import json
import urllib
from urllib.parse import urlparse
import httplib2 as http  # External library
import datetime
import pyodbc as db
import os
import gzip
import csv
import shutil

def task():
    # Authentication parameters
    headers = {'AccountKey': 'secret',
               'accept': 'application/json'}  # this is by default

    # API parameters
    uri = 'http://somewebsite.com/'  # Resource URL
    path = '/something/TrafficIncidents?'

    # Build query string & specify type of API call
    target = urlparse(uri + path)
    print(target.geturl())
    method = 'GET'
    body = ''

    # Get handle to http
    h = http.Http()

    # Obtain results
    response, content = h.request(target.geturl(), method, body, headers)
    api_call_time = datetime.datetime.now()
    filename = "traffic_incidents_" + str(datetime.datetime.today().strftime('%Y-%m-%d'))
    createHeader = 1
    if os.path.exists(filename + '.csv'):
        csvFile = open(filename + '.csv', 'a')
        createHeader = 0
    else:
        # compress previous day's file
        prev_filename = "traffic_incidents_" + (datetime.datetime.today() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
        if os.path.exists(prev_filename + '.csv'):
            with open(prev_filename + '.csv', 'rb') as f_in, gzip.open(prev_filename + '.csv.gz', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            os.remove(prev_filename + '.csv')
        # create new csv file for writing
        csvFile = open(filename + '.csv', 'w')

    # Parse JSON to print
    jsonObj = json.loads(content)
    print(json.dumps(jsonObj, sort_keys=True, indent=4))
    with open("traffic_incidents.json", "w") as outfile:
        # Saving jsonObj["d"]
        json.dump(jsonObj, outfile, sort_keys=True, indent=4, ensure_ascii=False)

    for i in range(len(jsonObj["value"])):
        jsonObj["value"][i]["IncidentTime"] = jsonObj["value"][i]["Message"].split(' ', 1)[0]
        jsonObj["value"][i]["Message"] = jsonObj["value"][i]["Message"].split(' ', 1)[1]
        jsonObj["value"][i]["ApiCallTime"] = api_call_time

    # Save to csv file
    header = jsonObj["value"][0].keys()
    csvwriter = csv.writer(csvFile, lineterminator='\n')
    if createHeader == 1:
        csvwriter.writerow(header)
    for i in range(len(jsonObj["value"])):
        csvwriter.writerow(jsonObj["value"][i].values())
    csvFile.close()

    t = threading.Timer(120, task)
    t.start()

while True:
    try:
        task()
    except IndexError:
        pass
    else:
        break
I get the following error and the program stops:
header = jsonObj["value"][0].keys()
IndexError: list index out of range
I would like the program to keep running even after the IndexError has occurred. How can I edit the code to achieve that?
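The traceback points at jsonObj["value"] being an empty list, and the retry loop only protects the first call to task(); later calls are made by threading.Timer in a separate thread, where the outer try/except never sees the exception. One possible sketch (not from the original post) is to guard the empty case inside task() itself, just after the JSON is parsed, so that every scheduled run survives it and still schedules the next one:

    # sketch: place inside task(), right after jsonObj = json.loads(content)
    if not jsonObj.get("value"):
        print("No incidents returned; skipping this run")
        csvFile.close()
        t = threading.Timer(120, task)  # keep polling on the usual schedule
        t.start()
        return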

Check response using urllib2

I am trying to access pages by incrementing the page counter using the OpenCorporates API. The problem is that sometimes the data is useless. For example, for jurisdiction_code = ae_az the URL below returns a page showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How do I check for such data and skip over it to move on to the next jurisdiction?
This is my code:
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}?api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
    break
...
Here's your code rewritten with Requests, using the check from the other answer. It is nowhere near as robust or clean as it should be, but it demonstrates the path you might want to take. The rate-limit handling is a guess and doesn't seem to work. Remember to put your actual API token in.
import json
import os
from time import sleep
import requests

url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20  # seconds to wait after being rate limited

with open('codes') as f:
    codes = [l.strip('\n') for l in f]

def get_page(code, page, **kwargs):
    params = {
        # 'api_token': token,
        'jurisdiction_code': code,
        'page': page,
    }
    params.update(kwargs)
    while True:
        r = requests.get(url, params=params)
        try:
            data = r.json()
        except ValueError:
            return None
        if 'error' in data:
            print data['error']['message']
            sleep(rate)
            continue
        return data['results']

def dump_page(code, page, data):
    with open(os.path.join(code, str(page) + '.json'), 'w') as f:
        json.dump(data, f)

for code in codes:
    try:
        os.makedirs(code)
    except os.error:
        pass
    data = get_page(code, 1)
    if data is None:
        continue
    dump_page(code, 1, data['companies'])
    for page in xrange(1, int(data.get('total_pages', 1))):
        data = get_page(code, page)
        if data is None:
            break
        dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
    json_content = json.loads(content)
    if json_content['results']['total_count'] > 0:
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
except KeyError:
    break
except ValueError:
    break
etc. You might want to report the ValueError or the KeyError, but that's up to you.
