Bulk download using Python with requests

I've been trying to download all the files on this page (https://apps.fs.usda.gov/fia/datamart/datamart_excel.html) in bulk, but am having some issues.
All the filenames follow the pattern '{state abbreviation}.xlsm', so I can download a single file with requests using code like this:
import requests

url = 'https://apps.fs.usda.gov/fia/datamart/Workbooks/WA.xlsm'
r = requests.get(url)
with open('WA.xlsm', 'wb') as f:
    f.write(r.content)
I believe there should be a way to incorporate this into a for loop to get all of the files, but I'm at a loss. Any advice?
Thanks!

Try the below
import requests

states = ['WA', 'CA']  # TODO add more states
for state in states:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    with open(f'{state}.xlsm', 'wb') as f:
        f.write(r.content)
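If a state abbreviation has no workbook at that URL (or a request fails), the loop above will happily save an error page as an .xlsm file. A minimal sketch of the same loop with a status check added (the two-state list is just a placeholder):

import requests

states = ['WA', 'CA']  # TODO add more states
for state in states:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    if r.status_code != 200:
        # skip any state whose workbook could not be fetched
        print(f'Skipping {state}: HTTP {r.status_code}')
        continue
    with open(f'{state}.xlsm', 'wb') as f:
        f.write(r.content)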

Just to add on to #balderman's answer: if you have many states to fetch, it can be slightly more efficient to use a threading approach. A straightforward example using concurrent.futures:
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from time import time

import requests

states = ['WA', 'CA', 'VA', 'NC']  # TODO add more states
out_dir = Path('temp_files')
out_dir.mkdir(exist_ok=True)

def get_content(state: str) -> bytes:
    url = f'https://apps.fs.usda.gov/fia/datamart/Workbooks/{state}.xlsm'
    r = requests.get(url)
    return r.content

start = time()
with ThreadPoolExecutor(max_workers=max(10, len(states))) as pool:
    for state, content in zip(states, pool.map(get_content, states)):
        with open(out_dir / f'{state}.xlsm', 'wb') as f:
            f.write(content)
print('Download ThreadExecutor took', time() - start)

# Compare times with below
# start = time()
# for state in states:
#     b = get_content(state)
#     with open(out_dir / f'{state}.xlsm', 'wb') as f:
#         f.write(b)
# print('Download took', time() - start)

Related

Using TQDM almost doubles the file size of my GET request

I've been writing code to download GRIB (weather) files off the internet for future use. Right now I'm only at the stage of downloading and writing to the right folder, but for some reason when I use TQDM for a progress bar, the file size almost doubles. Without the progress bar the file size is fine.
With the following code I get a 2.3MB file.
import datetime
import requests

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    while stat:
        if stat == 200:
            print('Fichier trouvé, téléchargement')  # file found, downloading
        elif stat == 404:
            print('Fichier introuvable')  # file not found
        break
    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
If I use TQDM for a progress bar like so, I get a 4.5MB file.
import datetime
import requests

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

def sfc_pres():
    id = fsearch
    url = 'https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2'.format(id)
    r = requests.get(url, allow_redirects=True)
    stat = r.status_code
    while stat:
        if stat == 200:
            print('Fichier trouvé, téléchargement')  # file found, downloading
        elif stat == 404:
            print('Fichier introuvable')  # file not found
        break
    from tqdm import tqdm
    total_size_in_bytes = int(r.headers.get('content-length', 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    id = fname
    with open(r'C:\Users\JM\Desktop\GRIB\Pression de surface 00UTC {0}.grib2'.format(id), 'wb') as f:
        f.write(r.content)
        for data in r.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        print("Échec du téléchargement")  # download failed
My troubleshooting narrowed it down to the TQDM part of the code, but I can't figure out why...
If you're using r.iter_content you shouldn't also call f.write(r.content); that writes the data twice (and loses the streaming behaviour you're trying to get).
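A minimal corrected sketch of the download part, reusing the names from the question; stream=True and writing to the current working directory are assumptions here:

import datetime
import requests
from tqdm import tqdm

fsearch = datetime.date.today().strftime('%Y%m%d00')
fname = datetime.date.today().strftime('%d-%m-%Y')

url = ('https://dd.weather.gc.ca/ensemble/geps/grib2/raw/00/000/'
       'CMC_geps-raw_PRES_SFC_0_latlon0p5x0p5_{0}_P000_allmbrs.grib2').format(fsearch)
r = requests.get(url, stream=True, allow_redirects=True)
if r.status_code == 200:
    total_size_in_bytes = int(r.headers.get('content-length', 0))
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    with open('Pression de surface 00UTC {0}.grib2'.format(fname), 'wb') as f:
        # each chunk is written exactly once; no separate f.write(r.content)
        for data in r.iter_content(1024):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()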

How do I download videos from Pexels API?

I have this code that can pull images off of Pexels, but I don't know how to change it to download videos. I haven't seen anyone do this before, and any help is greatly appreciated. I tried switching all the photo tags to video, but that didn't seem to work. I've also tried adding more libraries, but that doesn't seem to work either.
import argparse
import json
import os
import time

import requests
import tqdm
from pexels_api import API

PEXELS_API_KEY = os.environ['PEXELS_KEY']
MAX_IMAGES_PER_QUERY = 100
RESULTS_PER_PAGE = 10
PAGE_LIMIT = MAX_IMAGES_PER_QUERY / RESULTS_PER_PAGE

def get_sleep(t):
    def sleep():
        time.sleep(t)
    return sleep

def main(args):
    sleep = get_sleep(args.sleep)
    api = API(PEXELS_API_KEY)
    query = args.query
    page = 1
    counter = 0
    photos_dict = {}
    # Step 1: Getting urls and meta information
    while page <= PAGE_LIMIT:
        api.search(query, page=page, results_per_page=RESULTS_PER_PAGE)
        photos = api.get_entries()
        for photo in tqdm.tqdm(photos):
            photos_dict[photo.id] = vars(photo)['_Photo__photo']
            counter += 1
        if not api.has_next_page:
            break
        page += 1
        sleep()
    print(f"Finishing at page: {page}")
    print(f"Images were processed: {counter}")
    # Step 2: Downloading
    if photos_dict:
        os.makedirs(args.path, exist_ok=True)
        # Saving dict
        with open(os.path.join(args.path, f'{query}.json'), 'w') as fout:
            json.dump(photos_dict, fout)
        for val in tqdm.tqdm(photos_dict.values()):
            url = val['src'][args.resolution]
            fname = os.path.basename(val['src']['original'])
            image_path = os.path.join(args.path, fname)
            if not os.path.isfile(image_path):  # ignore if already downloaded
                response = requests.get(url, stream=True)
                with open(image_path, 'wb') as outfile:
                    outfile.write(response.content)
            else:
                print(f"File exists: {image_path}")

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--query', type=str, required=True)
    parser.add_argument('--path', type=str, default='./results_pexels')
    parser.add_argument('--resolution', choices=['original', 'large2x', 'large',
                                                 'medium', 'small', 'portrait',
                                                 'landscape', 'tiny'], default='original')
    parser.add_argument('--sleep', type=float, default=0.1)
    args = parser.parse_args()
    main(args)
Sorry for bumping the question. I just faced a similar situation when downloading videos from Pexels using the Python API pexelsPy, and this may be helpful:
I retrieved the ID of each video and then built the download URL, which has the following structure: "https://www.pexels.com/video/" + ID + "/download".
See the following example:
def download_video(type_of_videos):
    video_tag = random.choice(type_of_videos)
    PEXELS_API = '-'  # please add your API Key here
    api = API(PEXELS_API)
    retrieved_videos = read_already_download_files('downloaded_files.txt')
    video_found_flag = True
    num_page = 1
    while video_found_flag:
        api.search_videos(video_tag, page=num_page, results_per_page=10)
        videos = api.get_videos()
        for data in videos:
            if data.width > data.height:  # look for horizontal orientation videos
                if data.url not in retrieved_videos:
                    # write_file('downloaded_files.txt', data.url)
                    url_video = 'https://www.pexels.com/video/' + str(data.id) + '/download'  # create the url with the video id
                    r = requests.get(url_video)
                    with open(data.url.split('/')[-2] + '.mp4', 'wb') as outfile:
                        outfile.write(r.content)
                    return data.url.split('/')[-2] + '.mp4'  # download the video
        num_page += 1
The download_video function takes a list of tag strings, e.g. ['happy', 'sad', 'relax'], and randomly chooses one of these tags.
PEXELS_API should contain your API key.
read_already_download_files('downloaded_files.txt') retrieves the URLs of files that were already downloaded, so the current result can be skipped if it has been downloaded before.
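Those two helpers aren't shown in the answer; a minimal sketch of what they might look like (the one-URL-per-line file format is an assumption):

def read_already_download_files(path):
    # Hypothetical helper: return the set of URLs already recorded, one per line.
    try:
        with open(path) as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()

def write_file(path, url):
    # Hypothetical helper: append a newly downloaded URL to the record file.
    with open(path, 'a') as f:
        f.write(url + '\n')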
from pypexels import PyPexels
import requests

api_key = 'api id'
# instantiate PyPexels object
py_pexel = PyPexels(api_key=api_key)
search_videos_page = py_pexel.videos_search(query="love", per_page=40)
# while True:
for video in search_videos_page.entries:
    print(video.id, video.user.get('name'), video.url)
    data_url = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(data_url)
    print(r.headers.get('content-type'))
    with open('sample.mp4', 'wb') as outfile:
        outfile.write(r.content)
    # if not search_videos_page.has_next:
    break
    # search_videos_page = search_videos_page.get_next_page()
I just tried to do the same. When I was looking for it, I wanted a simple example; all the other fancy stuff I was sure I could add myself. So I built upon inou's answer. The example shown is very basic: it requests one page with only 5 results using the 'Tiger' tag in the search query. I download the first video using the id provided by the response and simply write it to the source folder. The API is provided by pexelsPy and the request is executed using the standard requests package. To get access to the API, you need to create a key on the Pexels website (see here). Once you have your own API key, you should be able to simply substitute the example key shown and run the code as a test.
import pexelsPy
import requests

PEXELS_API = '16gv62567257256iu78krtuzwqsddudrtjberzabzwzjsrtgswnr'
api = pexelsPy.API(PEXELS_API)
api.search_videos('Tiger', page=1, results_per_page=5)
videos = api.get_videos()
url_video = 'https://www.pexels.com/video/' + str(videos[0].id) + '/download'
r = requests.get(url_video)
with open('test.mp4', 'wb') as outfile:
    outfile.write(r.content)
You can download multiple videos with this code:
import pexelsPy
import requests

PEXELS_API = '-'
api = pexelsPy.API(PEXELS_API)
api.search_videos('nature', page=2, results_per_page=100, orientation='landscape')
videos = api.get_videos()
for i, video in enumerate(videos):
    url_video = 'https://www.pexels.com/video/' + str(video.id) + '/download'
    r = requests.get(url_video)
    with open(f'test_{i}.mp4', 'wb') as outfile:
        outfile.write(r.content)
This will download 100 videos, with each video being written to a separate file named test_0.mp4, test_1.mp4, ..., test_99.mp4.

multiprocessing slower than loop

I'm trying to write a huge amount of data to a CSV file. With the normal method it writes about 50 rows per second, but with multiprocessing it drops to about 5 rows per second.
I also added sys.setrecursionlimit(25000), because without it I get an error.
I can tell I'm not doing this right. What is the right way?
from bs4 import BeautifulSoup
import requests
import lxml
import csv
import cchardet
from multiprocessing import Pool
import sys
import time

sys.setrecursionlimit(25000)

csvfileWrite = open("comments.csv", 'a+', newline='', encoding='utf-8')  # declared as a global variable
writer = csv.writer(csvfileWrite, delimiter=';', quotechar='"',
                    quoting=csv.QUOTE_MINIMAL)  # declared as a global variable

def kacYildiz(div):  # This function returns a number from 0 to 5. Not important.
    yildizSayisi = 0
    yildizYeri = div.find("div", attrs={"class": "RatingPointer-module-1OKF3"})
    yildizlar = yildizYeri.find_all("svg")
    for yildiz in yildizlar:
        sonuc = yildiz.find("path").get("fill")
        if sonuc == "#f28b00":
            yildizSayisi += 1
    return yildizSayisi

def takeText(div):
    comment = div.find("span", attrs={"itemprop": "description"}).text
    return comment

def yorumSayfaSayisi(row):  # This function returns how many pages the site's comment section has. Not important.
    yorumKismi = "-yorumlari?"
    adres = row[0] + yorumKismi
    r = requests_session.get(adres)
    soup = BeautifulSoup(r.text, "lxml")
    sayfaS = soup.find("ul", attrs={"class": "PaginationBar-module-3qhrm"})
    sayi = sayfaS.find_all("li")[-1].text
    return sayi

def writeToCsv(comments):  # writing comments to the csv file
    global csvfileWrite
    global writer
    textToWrite = takeText(comments)
    writer.writerow([kacYildiz(comments), textToWrite])

if __name__ == '__main__':
    pageNumber = 1
    requests_session = requests.Session()
    comments = list()
    csvfile = open('adresler.csv', newline='')
    reader = csv.reader(csvfile, delimiter=';', quotechar='|')
    for row in reader:
        rowNumber = yorumSayfaSayisi(row)
        for i in range(1, int(rowNumber)):
            comments.clear()
            commetAdress = "-yorumlari?sayfa={}".format(i)
            adress = row[0] + commetAdress
            r = requests_session.get(adress)
            soup = BeautifulSoup(r.text, "lxml")
            page = soup.find_all("div", attrs={"class": "ReviewCard-module-3Y36S"})
            for comment in page:
                comments.append(comment)
            p = Pool(10)
            start = time.process_time()
            p.map(writeToCsv, comments)
            p.terminate()
            p.join()
Try this approach using ThreadPool instead:
from multiprocessing.pool import ThreadPool

def csvYaz(yorumlar):
    global csvfileYaz
    global yazici
    yazi = yorumAl(yorumlar)
    yazici.writerow([kacYildiz(yorumlar), yazi])

# ------main-----
for yorum in yorumSayfasi:
    yorumlar.append(yorum)
threads = ThreadPool(10).map(csvYaz, yorumlar)
for zz in threads:
    print(zz)
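For what it's worth, most of the slowdown is likely the cost of creating a new Pool(10) inside the page loop and of pickling the BeautifulSoup objects to send them to worker processes (which is also why sys.setrecursionlimit was needed). A sketch of the same idea where the workers only parse and all CSV writing happens in the main thread, reusing kacYildiz, takeText, comments and writer from the question:

from multiprocessing.pool import ThreadPool

def parse_comment(div):
    # worker: only parse, return plain values instead of touching the shared file
    return (kacYildiz(div), takeText(div))

with ThreadPool(10) as pool:  # create the pool once, not inside the page loop
    rows = pool.map(parse_comment, comments)
writer.writerows(rows)  # the csv writer is used only from the main thread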

How to download multiple CSV files off a website using Python?

I am a beginner at programming (finance professional) and I am looking to cut manual work using Python. I want to download multiple CSVs (daily volatility CSVs for the past year) from https://www.nseindia.com/products/content/equities/equities/archieve_eq.htm
So far I am able to download one file at a time, but I am not able to apply a for loop to download the past year's CSVs. It would also help if I could skip downloading the CSVs for Saturdays and Sundays.
I made a CSV file listing the links to all the required CSV files, then tried to import that file and run a for loop over it, but I don't know enough programming to do that.
import requests
import shutil

r = requests.get('https://nseindia.com/archives/nsccl/volt/CMVOLT_01072018.CSV', stream=True)
if r.status_code == 200:
    with open("01072018.csv", 'wb') as f:
        r.raw.decode_content = True
        shutil.copyfileobj(r.raw, f)
Desired results: Download CSV files based on a date range input.
Actual results: Downloading 1 CSV file at a time.
filenames = ['https://nseindia.com/archives/nsccl/volt/CMVOLT_01072018.CSV',
             'https://nseindia.com/archives/nsccl/volt/CMVOLT_01082018.CSV',
             'https://nseindia.com/archives/nsccl/volt/CMVOLT_01092018.CSV',
             ]
for x in filenames:
    r = requests.get(x, stream=True)
    if r.status_code == 200:
        with open(x.split('_')[-1], 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
Alright, without adding another library, the following code should work, even though it didn't work on my machine, which has some restrictions.
import datetime as timer
import requests
import shutil

def download_data(date):
    url = 'https://nseindia.com/archives/nsccl/volt/CMVOLT_' + date + '.CSV'
    csv_filename = date + '.csv'
    try:
        print('Calling url:- ' + url)
        r = requests.get(url, stream=True, verify=False)
        if r.status_code == 200:
            with open(csv_filename, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
        r.close()
    except Exception as e:
        print('for Date ' + date + ' Exception happened, most probably a weekend, EXCEPTION Message is ' + str(e))

def code_runner():
    i = 0
    now = timer.datetime.now()
    day = now.day
    month = now.month
    year = now.year
    while i < 365:
        day = day - 1
        if day == 0:
            day = 31
            month = month - 1
            if month == 0:
                month = 12
                year = year - 1
        year1 = year
        month1 = '{:02d}'.format(month)
        day1 = '{:02d}'.format(day)
        date = str(day1) + str(month1) + str(year1)
        download_data(date)
        i += 1

if __name__ == '__main__':
    code_runner()
I would add a date loop for your script:
#!/usr/bin/env ipython
# --------------------
import requests
import shutil
import datetime
# -----------------------------------------------------------------------------------
dates = [datetime.datetime(2019, 1, 1) + datetime.timedelta(dval) for dval in range(0, 366)]
# -----------------------------------------------------------------------------------
for dateval in dates:
    r = requests.get('https://www.nseindia.com/archives/nsccl/volt/CMVOLT_' + dateval.strftime('%d%m%Y') + '.CSV', stream=True)
    if r.status_code == 200:
        with open(dateval.strftime('%d%m%Y') + ".csv", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
# ---------------------------------------------------------------------------------
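Since the question also asks to skip Saturdays and Sundays, the date loop above can filter them out with datetime.weekday(); a minimal sketch built on the same loop:

import datetime

import requests
import shutil

dates = [datetime.datetime(2019, 1, 1) + datetime.timedelta(dval) for dval in range(0, 366)]
for dateval in dates:
    if dateval.weekday() >= 5:  # 5 = Saturday, 6 = Sunday: no trading-day file to fetch
        continue
    r = requests.get('https://www.nseindia.com/archives/nsccl/volt/CMVOLT_' + dateval.strftime('%d%m%Y') + '.CSV', stream=True)
    if r.status_code == 200:
        with open(dateval.strftime('%d%m%Y') + ".csv", 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)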

Check response using urllib2

I am trying to access pages by incrementing the page counter using the opencorporates api. But the problem is that sometimes the response contains no useful data. For example, at the below URL for jurisdiction_code = ae_az I get a page showing just this:
{"api_version":"0.2","results":{"companies":[],"page":1,"per_page":26,"total_pages":0,"total_count":0}}
which is technically empty. How can I check for such data and skip it to move on to the next jurisdiction?
This is my code
import urllib2
import json, os

f = open('codes', 'r')
for line in f.readlines():
    id = line.strip('\n')
    url = 'http://api.opencorporates.com/v0.2/companies/search?q=&jurisdiction_code={0}&per_page=26&current_status=Active&page={1}?api_token=ab123cd45'
    i = 0
    directory = id
    os.makedirs(directory)
    while True:
        i += 1
        req = urllib2.Request(url.format(id, i))
        print url.format(id, i)
        try:
            response = urllib2.urlopen(url.format(id, i))
        except urllib2.HTTPError, e:
            break
        content = response.read()
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
Interpret the response you get back (you already know it's json) and check if the data you want is there.
...
content = response.read()
data = json.loads(content)
if not data.get('results', {}).get('companies'):
    break
...
Here's your code written with Requests and using the answer here. It is nowhere near as robust or clean as it should be, but demonstrates the path you might want to take. The rate limit is a guess, and doesn't seem to work. Remember to put your actual API key in.
import json
import os
from time import sleep

import requests

url = 'http://api.opencorporates.com/v0.2/companies/search'
token = 'ab123cd45'
rate = 20  # seconds to wait after rate limited

with open('codes') as f:
    codes = [l.strip('\n') for l in f]

def get_page(code, page, **kwargs):
    params = {
        # 'api_token': token,
        'jurisdiction_code': code,
        'page': page,
    }
    params.update(kwargs)
    while True:
        r = requests.get(url, params=params)
        try:
            data = r.json()
        except ValueError:
            return None
        if 'error' in data:
            print data['error']['message']
            sleep(rate)
            continue
        return data['results']

def dump_page(code, page, data):
    with open(os.path.join(code, str(page) + '.json'), 'w') as f:
        json.dump(data, f)

for code in codes:
    try:
        os.makedirs(code)
    except os.error:
        pass
    data = get_page(code, 1)
    if data is None:
        continue
    dump_page(code, 1, data['companies'])
    for page in xrange(1, int(data.get('total_pages', 1))):
        data = get_page(code, page)
        if data is None:
            break
        dump_page(code, page, data['companies'])
I think that actually this example is not "technically empty." It contains data and is therefore technically not empty. The data just does not include any fields that are useful to you. :-)
If you want your code to skip over responses that have uninteresting data, then just check whether the JSON has the necessary fields before writing any data:
content = response.read()
try:
    json_content = json.loads(content)
    if json_content['results']['total_count'] > 0:
        fo = str(i) + '.json'
        OUTFILE = os.path.join(directory, fo)
        with open(OUTFILE, 'w') as f:
            f.write(content)
except KeyError:
    break
except ValueError:
    break
etc. You might want to report the ValueError or the KeyError, but that's up to you.
