How to download a csv file in Python - python

I am trying to download a csv file from the url
https://qubeshub.org/publications/1220/supportingdocs/1#supportingdocs .
the file is Elephant Morphometrics and Tusk Size-originaldata-3861.csv
I have tried using using pd.read_csv()
and
import pandas as pd
import io
import requests
url="https://qubeshub.org/publications/1220/supportingdocs/1#supportingdocs/Elephant Morphometrics and Tusk Size-originaldata-3861.csv"
s=requests.get(url).content
c=pd.read_csv(io.StringIO(s.decode('utf-8')))

Try:
import requests
url = "https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1"
r = requests.get(url)
filename = r.headers["Content-Disposition"].split('"')[1]
with open(filename, "wb") as f_out:
print(f"Downloading {filename}")
f_out.write(r.content)
Prints:
Downloading Elephant Morphometrics and Tusk Size-originaldata-3861.csv
and saves the file.

This should download the file and parse the rows and columns into a csv file
import requests
import csv
url = "https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1"
req=requests.get(url)
rows = req.content.decode('utf-8').split("\r\n")
rows.pop()
csv_local_filename = "test.csv"
with open(csv_local_filename, 'w') as fs:
writer = csv.writer(fs, delimiter = ',')
for row in rows:
entries = row.split(',')
b=writer.writerow(entries)
You'll likely want to convert those columns into the desired types before you start working with them. The example code above leaves everything as a string.
After I run the above code I see:
>tail test.csv
2005-13,88,m,32.5,290,162.3,40
2005-13,51,m,37.5,270,113.2,40
2005-13,86,m,37.5,310,175.3,38
and
>head test.csv
Years of sample collection,Elephant ID,Sex,Estimated Age (years),shoulder Height in cm,Tusk Length in cm,Tusk Circumference in cm
1966-68,12,f,0.08,102,,
1966-68,34,f,0.08,89,,
1966-68,162,f,0.083,89,,
1966-68,292,f,0.083,92,,

In Firefox after downloading file in browser you can check link to this file and it shows
https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1
and this link you should use in code
import pandas as pd
df = pd.read_csv('https://qubeshub.org/publications/1220/serve/1/3861?el=1&download=1')
print(df)

Related

how to convert json file to csv with "success":true

I have problem with convert json file to csv file on python
and i think it will be the nested json file but i don't know how to handle it!
import json, requests
url = requests.get("https://####/api/food_orders")
text = url.text
data = json.load(text)
order_data = data['data']
# now we will open a file for writing
data_file = open('ordersJsonToCsv.csv', 'w', newline='')
# create the csv writer object
csv_writer = csv.writer(data_file)
# Counter variable used for writing
# headers to the CSV file
count = 0
for ord in order_data:
if count == 0:
# Writing headers of CSV file
header = ord.keys()
csv_writer.writerow(header)
count += 1
# Writing data of CSV file
csv_writer.writerow(ord.values())
data_file.close()
And Json file look like
This code will solve the problem to get data only
import pandas as pd
import json, requests
url = requests.get("https://##/api/orders?
text = url.text
info = json.loads(text)
df = pd.json_normalize(info['data'])
df.to_csv("samplecsv.csv")

Extracting Data from Multiple PDFs'

I am trying to extract data from PDF document and have regarding that - I was able to get the code working for one single PDF. However, is there a way I can point the code to a folder with multiple PDF's and get the extract out in CSV? I am a complete beginner in Python, so any help will be appreciated. Below is the current code that I have.
import pdfplumber
import pandas as pd
file = 'Test Slip.pdf'
lines = []
with pdfplumber.open(file) as pdf:
pages = pdf.pages
for page in pdf.pages:
text = page.extract_text()
for line in text.split('\n'):
lines.append(line)
print(line)
df = pd.DataFrame(lines)
df.to_csv('test.csv')
One possible option would be to use os.listdir and only read files that end in .pdf:
import os
folder_with_pdfs = '/path/to/folder'
for pdf_file in os.listdir(folder_with_pdfs):
if pdf_file.endswith('.pdf'):
pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
# do pdf reading with opening pdf_file_path
I am not sure why you aim to write lines to a dataframe as rows but this should be what you need:
import pdfplumber
import pandas as pd
import os
def extract_pdf(pdf_path):
linesOfFile = []
with pdfplumber.open(pdf_path) as pdf:
for pdf_page in pdf.pages:
single_page_text = pdf_page.extract_text()
for linesOfFile in single_page_text.split('\n'):
linesOfFile.append(line)
#print(linesOfFile)
return linesOfFile
folder_with_pdfs = 'folder_path'
linesOfFiles = []
for pdf_file in os.listdir(folder_with_pdfs):
if pdf_file.endswith('.pdf'):
pdf_file_path = os.path.join(folder_with_pdfs, pdf_file)
linesOfFile = extract_pdf(pdf_file_path)
linesOfFiles.append(linesOfFile)
df = pd.DataFrame(linesOfFiles)
df.to_csv('test.csv')

Export PDF to csv using python (tabula)

When exporting a PDF file to csv, it returns an error:writeheader() takes 1 positional argumentbut 2 were given
from tabula import read_pdf
from tabulate import tabulate
import csv
df = read_pdf("asd.pdf")
print(df)
with open('ddd.csv', "w", newline="") as file:
columns = ['specialty ',"name",'number_of_seats','Total_seats,' "document_type", "concent"]
writer = csv.DictWriter(file, fieldnames=columns)
writer.writeheader(df)
Code copied from http://theautomatic.net/2019/05/24/3-ways-to-scrape-tables-from-pdfs-with-python/, there is also more details ...
import tabula
file = "http://lab.fs.uni-lj.si/lasin/wp/IMIT_files/neural/doc/seminar8.pdf"
#tables = tabula.read_pdf(file, pages = "all", multiple_tables = True)
# output just the first table in the PDF to a CSV
tabula.convert_into(file, "output.csv", output_format="csv")
# output all the tables in the PDF to a CSV
tabula.convert_into(file, "output.csv", output_format="csv", pages='all')

How to read from a csv file in zip folder and save data from csv file in database?

import glob
import os
import csv
import zipfile
from io import StringIO
for name in glob.glob('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip'):
base = os.path.basename(name)
filename = os.path.splitext(base)[0]
datadirectory = 'C:/Users/RAMESH SANTHA/Downloads/'
dataFile = filename
archive = '.'.join([dataFile, 'zip'])
fullpath = ''.join([datadirectory, archive])
csv_file = '.'.join([dataFile, 'csv']) #all fixed
filehandle = open(fullpath, 'rb')
zfile = zipfile.ZipFile(filehandle)
data = StringIO.StringIO(zfile.read(csv_file))
reader = csv.reader(data)
for row in reader:
print (row)
I tried following code to read data from zip folder which contains csv file and print rows but got error:
data = StringIO.StringIO(zfile.read(csv_file))
AttributeError: type object '_io.StringIO' has no attribute 'StringIO'
There is no StringIO.StringIO() but io.StringIO()
import io
data = io.StringIO(...)
With your import it will be even without io.
from io import StringIO
data = StringIO(...)
BTW: I think you overcomplicated code using glob and join(). And you can use filename directly with ZipFile without open()
import os
import csv
import zipfile
import io
zip_fullname = 'C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip'
zip_file = os.path.basename(zip_fullname)
csv_file = zip_file.replace('.zip', '.csv')
print(zip_file) # download-NIFTY 50-01012020.zip
print(csv_file) # download-NIFTY 50-01012020.csv
zfile = zipfile.ZipFile(zip_fullname)
data = io.StringIO(zfile.read(csv_file).decode('utf-8')) # bytes needs to be converted to string
reader = csv.reader(data)
for row in reader:
print(row)
But with pandas it should be even simpler
import pandas as pd
df = pd.read_csv('C:/Users/RAMESH SANTHA/Downloads/download-NIFTY 50-01012020.zip')
print(df)
Looking at the script you getting error opening the csv file from zip file. Below is python 3 code that I have working for a zip file having few csv's. The directory to extract should exist before you run the script
import zipfile
path_to_zip_file='/tmp/test1.zip' # Assuming this file exist , This path is from mac, but should work for windows as well'
directory_to_extract_to='/tmp/extract/' # Assuming this directory already exist
import csv,os
import codecs
import glob
with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
zip_ref.extractall(directory_to_extract_to)
for file in glob.glob(directory_to_extract_to+'*.csv'):
path = os.path.join(directory_to_extract_to,file)
with open(path, 'rb') as f:
reader = csv.reader(codecs.iterdecode(f, 'utf-8'))
# Below code is print them as arrays
# for row in reader:
# print(row)
# Reading rows as ordered dictionary
dictReader = csv.DictReader(codecs.iterdecode(f, 'utf-8'))
for row in dictReader:
print(row)

How to write csv file in html?

I have read file of csv but I have a problem that how to read CSV file and save it in table.html?
import csv
html_about = ''
names = []
with open('filo.csv') as data_file:
csv_data = csv.reader(data_file)
for line in csv_data:
names.append(f'{line[0]}')
html_output = '\n<ul>'
for name in names:
html_output += f'\n\t<li>{name}</li>'
html_output += '\n</ul>'
from prettytable import PrettyTable
x = PrettyTable(line[0])
html_code = x.get_html_string()
html_file = open('table.html','w')
html_file = html_file.write(html_code)
I suggest you use pandas library,
it has pd.read_csv, and also pd.to_html
usage should look like this, let me know if this works for you:
import pandas as pd
df = pd.read_csv('filo.csv')
with open('table.html', 'w') as html_file:
df.to_html(html_file)

Categories