Downloading csv data from an API - python

I am attempting to download CSV data from an API which I will then edit, but I am struggling to get the different functions to work together, i.e. passing the export link through to download the file and then on to opening it.
'''
File name: downloadAWR.py
Author: Harry&Joe
Date created: 3/10/17
Date last modified: 5/10/17
Version: 3.6
'''
import requests
import json
import urllib2
import zipfile
import io
import csv
import os
from urllib2 import urlopen, URLError, HTTPError
geturl() is used to create a download link for the CSV data. One link is created from the user input (in this case the project name and dates); we can then use that link to download the data. The link is stored in export_link.
def geturl():
    #getProjectName
    project_name = 'BIMM'
    #getApiToken
    api_token = "API KEY HERE"
    #getStartDate
    start_date = '2017-01-01'
    #getStopDate
    stop_date = '2017-09-01'
    url = "https://api.awrcloud.com/get.php?action=export_ranking&project=%s&token=%s&startDate=%s&stopDate=%s" % (project_name,api_token,start_date,stop_date)
    export_link = requests.get(url).content
    return export_link
dlfile() is used to actually use the link and get a file we can manipulate and edit, e.g. removing columns and some of the data.
def dlfile(export_link):
    # Open the url
    try:
        f = urlopen(export_link)
        print ("downloading " + export_link)
        # Open our local file for writing
        with open(os.path.basename(export_link), "wb") as local_file:
            local_file.write(f.read())
    #handle errors
    except HTTPError as e:
        print ("HTTP Error:", e.code, export_link)
    except URLError as e:
        print ("URL Error:", e.reason, export_link)
    return f
readdata() is used to go into the file and open it for us to use.
def readdata():
    with zipfile.ZipFile(io.BytesIO(zipdata)) as z:
        for f in z.filelist:
            csvdata = z.read(f)
            #reader = csv.reader(io.StringIO(csvdata.decode()))
def main():
    #Do something with the csv data
    export_link = (geturl())
    data = dlfile(export_link)
    csvdata = data.readdata()

if __name__ == '__main__':
    main()
Generally I'm finding that each piece of the code works independently, but it struggles when I try to put it all together.

You need to clean up and call your code appropriately. It seems you copy-pasted from different sources and now you have a salad bowl of code that isn't mixing well.
If the task is just to read and open a remote file to do something to it:
import io
import zipfile
import requests

def get_csv_file(project, api_token, start_date, end_date):
    url = "https://api.awrcloud.com/get.php"
    params = {'action': 'export_ranking',
              'project': project,
              'token': api_token,
              'startDate': start_date,
              'stopDate': end_date}
    r = requests.get(url, params)
    r.raise_for_status()
    # the first request returns the export link; a second request fetches the zip itself
    return zipfile.ZipFile(io.BytesIO(requests.get(r.content).content))

def process_csv_file(zip_file):
    contents = zip_file.extractall()
    # do stuff with the contents

if __name__ == '__main__':
    process_csv_file(get_csv_file('BIMM', 'api-key', '2017-01-01', '2017-09-01'))
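If you just want the rows rather than files on disk, a minimal sketch of process_csv_file could read each archived member in memory instead of calling extractall() (this assumes every file inside the zip is a plain UTF-8 CSV):

import csv
import io

def process_csv_file(zip_file):
    # Walk every member of the archive without extracting it to disk
    for name in zip_file.namelist():
        with zip_file.open(name) as member:
            # Wrap the binary stream so the csv module sees text
            reader = csv.reader(io.TextIOWrapper(member, encoding='utf-8'))
            for row in reader:
                print(row)  # replace with your own editing/filtering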


Databricks web scraping

I wrote some Python code to automatically download a CSV file from the internet. The code works when it runs on my local computer but not when I run it on Databricks. The problem is that I don't know how to save it to my DBFS folder: folder = "/dbfs/mnt/fmi-import/DNB Scenariosets/". The code does execute, but the file is nowhere to be found.
import requests
from bs4 import BeautifulSoup
import re

url_scenariosets_dnb = 'https://www.dnb.nl/voor-de-sector/open-boek-toezicht-sectoren/pensioenfondsen/haalbaarheidstoets/uitvoering-en-normen/scenarioset-haalbaarheidstoets-pensioenfondsen/'
folder = "/dbfs/mnt/fmi-import/DNB Scenariosets/"

class dataset_downloader:
    def __init__(self, url):
        self.url = url

    def scrape(self):
        reqs = requests.get(self.url, verify=False)
        soup = BeautifulSoup(reqs.text, 'html.parser')
        self.urls = []
        for link in soup.find_all('a'):
            self.urls.append(link.get('href'))
        return self.urls

    def filter_scenarioset(self):
        # Search data based on regular expression in the list
        self.scenarioset_links = []
        [self.scenarioset_links.append('https://www.dnb.nl' + val) for val in self.urls
         if re.search(r'hbt-scenarioset-10k', val)]
        return self.scenarioset_links

    def download_file(self, year, quarter):
        try:
            self.downloadlink = []
            [self.downloadlink.append(val) for val in self.scenarioset_links
             if re.search(r'hbt-scenarioset-10k-{}q{}'.format(year, quarter), val)]
            filename = 'hbt-scenarioset-10k-{}q{}.xlsx'.format(year, quarter)
            with requests.get(self.downloadlink[0]) as req:
                with open(filename, 'wb') as f:
                    for chunk in req.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return "/dbfs/mnt/fmi-import/DNB Scenariosets/" + filename
        except Exception as e:
            print(e)
            return None

#%% EXECUTE
download = dataset_downloader(url_scenariosets_dnb)
download.scrape()
download.filter_scenarioset()
download.download_file(2020, 2)  # select year and quarter
Do you have any suggestion on how you can download a csv file with databricks and save it to a DBFS folder? Thank you in advance!
Vincent
The problem is that in your code you have only the filename, without the path to the folder. It should be:
with open(folder + filename, 'wb')
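For illustration, here is a self-contained sketch of that change (the helper name download_to_dbfs is hypothetical, and it assumes the /dbfs/mnt/fmi-import/DNB Scenariosets/ mount is available on the cluster):

import os
import requests

DBFS_FOLDER = "/dbfs/mnt/fmi-import/DNB Scenariosets/"

def download_to_dbfs(url, filename, folder=DBFS_FOLDER):
    # Build the full path under the DBFS mount so the file does not land in the driver's working directory
    path = os.path.join(folder, filename)
    with requests.get(url, stream=True) as req:
        req.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in req.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    return path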

Post Large File Using requests_toolbelt to vk

I am new to Python. I wrote a simple script for uploading a video from a URL to vk. I tested this script with small files and it works, but for large files I run out of memory. I read that using requests_toolbelt it's possible to post a large file. How can I add this to my script?
import vk
import requests
from homura import download
import glob
import os
import json
url=raw_input("Enter URL: ")
download(url)
file_name = glob.glob('*.mp4')[0]
session = vk.Session(access_token='TOKEN')
vkapi = vk.API(session,v='5.80' )
params={'name' : file_name,'privacy_view' : 'nobody', 'privacy_comment' : 'nobody'}
param = vkapi.video.save(**params)
upload_url = param['upload_url']
print ("Uploading ...")
request = requests.post(upload_url, files={'video_file': open(file_name, "rb")})
os.remove (file_name)
requests_toolbelt (https://github.com/requests/toolbelt) has an example that should work for you:
import requests
from requests_toolbelt import MultipartEncoder
...
...
m=MultipartEncoder( fields={'video_file':(file_name, open(file_name, "rb"))})
response = requests.post(upload_url, data=m, headers={'Content-Type': m.content_type})
If you know your video file's MIME type, you can add it as a third item in the tuple, like this:
m=MultipartEncoder( fields={
'video_file':(file_name, open(file_name,"rb"), "video/mp4")})
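For completeness, a sketch of how the streaming upload would slot into your script (not tested against the vk API; upload_url and file_name come from your existing code, and video/mp4 is an assumption about the file type):

import requests
from requests_toolbelt import MultipartEncoder

def upload_video(upload_url, file_name):
    # MultipartEncoder streams the file from disk instead of loading it into memory in one go
    with open(file_name, "rb") as fh:
        m = MultipartEncoder(fields={'video_file': (file_name, fh, "video/mp4")})
        response = requests.post(upload_url, data=m,
                                 headers={'Content-Type': m.content_type})
    response.raise_for_status()
    return response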

Print JSON data from csv list of multiple urls

Very new to Python, and I haven't found a specific answer on SO, but apologies in advance if this appears very naive or is already answered elsewhere.
I am trying to print 'IncorporationDate' JSON data from multiple URLs of a public data set. I have the URLs saved as a CSV file (snippet below). I am only getting as far as printing ALL the JSON data from one URL, and I am uncertain how to run that over all of the CSV URLs and write just the IncorporationDate values to a CSV.
Any basic guidance or edits are really welcomed!
try:
    # For Python 3.0 and later
    from urllib.request import urlopen
except ImportError:
    # Fall back to Python 2's urllib2
    from urllib2 import urlopen
import json

def get_jsonparsed_data(url):
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)

url = ("http://data.companieshouse.gov.uk/doc/company/01046514.json")
print(get_jsonparsed_data(url))

import csv
with open('test.csv') as f:
    lis = [line.split() for line in f]
    for i, x in enumerate(lis):
        print()

import StringIO
s = StringIO.StringIO()
with open('example.csv', 'w') as f:
    for line in s:
        f.write(line)
Snippet of csv:
http://business.data.gov.uk/id/company/01046514.json
http://business.data.gov.uk/id/company/01751318.json
http://business.data.gov.uk/id/company/03164710.json
http://business.data.gov.uk/id/company/04403406.json
http://business.data.gov.uk/id/company/04405987.json
Welcome to the Python world.
For making HTTP requests, we commonly use requests because of its dead-simple API.
The code snippet below does what I believe you want:
It grabs the data from each of the urls you posted
It creates a new CSV file with each of the IncorporationDate values.
```
import csv
import requests

COMPANY_URLS = [
    'http://business.data.gov.uk/id/company/01046514.json',
    'http://business.data.gov.uk/id/company/01751318.json',
    'http://business.data.gov.uk/id/company/03164710.json',
    'http://business.data.gov.uk/id/company/04403406.json',
    'http://business.data.gov.uk/id/company/04405987.json',
]

def get_company_data():
    for url in COMPANY_URLS:
        res = requests.get(url)
        if res.status_code == 200:
            yield res.json()

if __name__ == '__main__':
    for data in get_company_data():
        try:
            incorporation_date = data['primaryTopic']['IncorporationDate']
        except KeyError:
            continue
        else:
            with open('out.csv', 'a') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([incorporation_date])
```
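A small variation, in case you prefer to open the output file once and add a header row (a sketch built on the same get_company_data generator as above):

def write_incorporation_dates(path='out.csv'):
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['IncorporationDate'])  # header row
        for data in get_company_data():
            try:
                writer.writerow([data['primaryTopic']['IncorporationDate']])
            except KeyError:
                continue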
First step: read all the URLs from your CSV file.

import csv

with open('test.csv', newline='') as f:
    csvReader = csv.reader(f)
    # next(csvReader)  # uncomment if you have a header in the .CSV file
    all_urls = [row[0] for row in csvReader if row]
Second step: fetch the data from a URL.

import json
from urllib.request import urlopen

def get_jsonparsed_data(url):
    response = urlopen(url)
    data = response.read().decode("utf-8")
    return json.loads(data)

url_data = get_jsonparsed_data("give_your_url_here")
Third step:
Go through all the URLs that you got from the CSV file
Get the JSON data
Fetch the field you need, in your case "IncorporationDate"
Write it into an output CSV file; I'm naming it IncorporationDates.csv
Code below:

with open('IncorporationDates.csv', 'w') as abc:
    for each_url in all_urls:
        url_data = get_jsonparsed_data(each_url)
        abc.write(url_data['primaryTopic']['IncorporationDate'] + '\n')

Check multiple url from csv if valid or not, using python

I have this script that works if I hard-code the link in the script itself, but I wish to take multiple URLs from a CSV file with a column called url_to_check and validate them one by one to see whether each URL is valid or not. Please help. Thanks.
import httplib
from urlparse import urlparse

def checkUrl(url):
    p = urlparse(url)
    conn = httplib.HTTPConnection(p.netloc)
    conn.request('HEAD', p.path)
    resp = conn.getresponse()
    return resp.status < 400

if __name__ == '__main__':
    print checkUrl('http://www.stackoverflow.com')
You can use python's csv module for parsing your csv file.
A simple example using your example column name and checkUrl function:
import csv

with open('/path/to/your/csv/file') as fobj:
    reader = csv.DictReader(fobj)
    for row in reader:
        valid = checkUrl(row['url_to_check'])
        print('%s is %svalid' % (row['url_to_check'], '' if valid else 'in'))
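As a side note, if you can move to Python 3 with the requests library (an assumption, since your script currently targets Python 2's httplib), the same check plus the CSV loop fits in a few lines:

import csv
import requests

def check_url(url):
    # HEAD keeps the check cheap; anything below 400 counts as valid
    try:
        return requests.head(url, allow_redirects=True, timeout=10).status_code < 400
    except requests.RequestException:
        return False

with open('/path/to/your/csv/file', newline='') as fobj:
    for row in csv.DictReader(fobj):
        url = row['url_to_check']
        print('%s is %svalid' % (url, '' if check_url(url) else 'in'))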

How do I download a zip file in python using urllib2?

Two-part question. I am trying to download multiple archived Cory Doctorow podcasts from the Internet Archive, the old ones that do not come into my iTunes feed. I have written the script, but the downloaded files are not properly formatted.
Q1 - What do I change to download the zip mp3 files?
Q2 - What is a better way to pass the variables into the URL?
# and the base url.
def dlfile(file_name,file_mode,base_url):
    from urllib2 import Request, urlopen, URLError, HTTPError

    #create the url and the request
    url = base_url + file_name + mid_url + file_name + end_url
    req = Request(url)

    # Open the url
    try:
        f = urlopen(req)
        print "downloading " + url

        # Open our local file for writing
        local_file = open(file_name, "wb" + file_mode)
        #Write to our local file
        local_file.write(f.read())
        local_file.close()

    #handle errors
    except HTTPError, e:
        print "HTTP Error:",e.code , url
    except URLError, e:
        print "URL Error:",e.reason , url

# Set the range
var_range = range(150,153)
# Iterate over image ranges
for index in var_range:
    base_url = 'http://www.archive.org/download/Cory_Doctorow_Podcast_'
    mid_url = '/Cory_Doctorow_Podcast_'
    end_url = '_64kb_mp3.zip'
    #create file name based on known pattern
    file_name = str(index)
    dlfile(file_name,"wb",base_url)
This script was adapted from here
Here's how I'd deal with the URL building and downloading. I'm making sure to name the file as the basename of the URL (the last bit after the final slash), and I'm also using the with clause for opening the file to write to. This uses a context manager, which is nice because it will close the file when the block exits. In addition, I use a template to build the string for the URL. urlopen doesn't need a Request object, just a string.
import os
from urllib2 import urlopen, URLError, HTTPError

def dlfile(url):
    # Open the url
    try:
        f = urlopen(url)
        print "downloading " + url

        # Open our local file for writing
        with open(os.path.basename(url), "wb") as local_file:
            local_file.write(f.read())

    #handle errors
    except HTTPError, e:
        print "HTTP Error:", e.code, url
    except URLError, e:
        print "URL Error:", e.reason, url

def main():
    # Iterate over image ranges
    for index in range(150, 151):
        url = ("http://www.archive.org/download/"
               "Cory_Doctorow_Podcast_%d/"
               "Cory_Doctorow_Podcast_%d_64kb_mp3.zip" %
               (index, index))
        dlfile(url)

if __name__ == '__main__':
    main()
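If you later move to Python 3, where urllib2 no longer exists, a roughly equivalent sketch uses urllib.request with the same URL template:

import os
from urllib.request import urlopen
from urllib.error import HTTPError, URLError

def dlfile(url):
    # Open the url
    try:
        with urlopen(url) as f:
            print("downloading " + url)
            # Open our local file for writing
            with open(os.path.basename(url), "wb") as local_file:
                local_file.write(f.read())
    # handle errors
    except HTTPError as e:
        print("HTTP Error:", e.code, url)
    except URLError as e:
        print("URL Error:", e.reason, url)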
An older solution on SO along the lines of what you want:
download a zip file to a local drive and extract all files to a destination folder using python 2.5
Python and urllib
