Pandas Write CSV - Append vs. Write - python

I would like to use df.to_csv to write "filename" (with headers) if "filename" doesn't exist, otherwise to append to "filename" if it exists. If I simply use the command:
df.to_csv('filename.csv', mode='a', header='column_names')
The write or append succeeds, but it seems like the header is written every time an append takes place.
How can I only add the header if the file doesn't exist, and append without header if the file does exist?

Not sure there is a way in pandas, but checking whether the file exists is a simple approach:
import os

# if the file does not exist, write with the header
if not os.path.isfile('filename.csv'):
    df.to_csv('filename.csv', header=True)  # header='column_names' is not valid; pass True or a list of names
else:  # it exists, so append without writing the header
    df.to_csv('filename.csv', mode='a', header=False)

with open(filename, 'a') as f:
    df.to_csv(f, mode='a', header=f.tell() == 0)
This writes the header only the first time anything is written to the file, i.e. when f.tell() == 0 (the file is empty).
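For illustration, a minimal end-to-end run of this pattern (the sample data and the demo.csv name here are made up for the example):
import os
import pandas as pd

demo_file = 'demo.csv'  # hypothetical file name for this sketch
if os.path.exists(demo_file):
    os.remove(demo_file)  # start from a clean slate

chunks = [pd.DataFrame({'a': [1], 'b': [2]}),
          pd.DataFrame({'a': [3], 'b': [4]})]
for chunk in chunks:
    with open(demo_file, 'a') as f:
        # header is written only on the first pass, when the file is empty
        chunk.to_csv(f, header=f.tell() == 0, index=False)

print(open(demo_file).read())
# a,b
# 1,2
# 3,4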

In the pandas DataFrame to_csv function, use header=False if the csv file already exists and you are appending to it:
import os
hdr = not os.path.isfile('filename.csv')
df.to_csv('filename.csv', mode='a', header=hdr)

The above solutions are great, but I have a moral obligation to include the pathlib solution here:
from pathlib import Path

file_path = Path(filename)
if file_path.exists():
    df.to_csv(file_path, header=False, mode='a')
else:
    df.to_csv(file_path, header=True, mode='w')
Alternatively (depending on your inlining preferences):
file_exists = file_path.exists()
df.to_csv(file_path, header=not file_exists, mode='a' if file_exists else 'w')

Apart from checking whether the file exists, you can also check that it has non-zero size, since it makes sense to write the header if the file exists but is empty (i.e. has no content). I find this helpful in some edge cases.
import os.path
header_flag = not (os.path.exists(fpath) and os.path.getsize(fpath) > 0)
df.to_csv(fpath, mode='a', index=False, header=header_flag)

If you have a dict and want to write and append it to a CSV file:
import pandas as pd

file_name = 'data.csv'
my_dict = {"column_1": "Apple", "column_2": "Mango"}

with open(file_name, 'a') as f:
    # index=[0] is needed because all the dict values are scalars;
    # without it, DataFrame() raises an error.
    df = pd.DataFrame(my_dict, index=[0])
    df.to_csv(f, header=f.tell() == 0, index=False)

Related

How to replace the header of all CSV files in a directory?

I have a folder of CSV files, and I need to simply replace the current header (first row) of each CSV with a different header. As an example, every CSV has A, B, C, D, E as the first-row header, but I need to be able to change that to whatever I want, e.g. Apple, Orange, Lemon, Pear, Peach or 1, j, er, fd, j5.
All the data in each CSV needs to be retained besides the header, and the replacement header will make the headers of all CSVs in the folder identical, per what is indicated in the code.
import shutil
import glob

files = glob.glob("/home/robert/Testing/D1/*.csv")
for i in range(len(files)):
    from_file = open(files[i])
    to_file = open(files[i], mode="w")
    to_file.write("id,t,s,p,date,e")
    shutil.copyfileobj(from_file, to_file)
I used this code, however, it deleted all of the other data in the CSV files, which I needed to keep, and only left/created the headers
from pathlib import Path

def update_folder(folder: Path):
    for fname in folder.glob('*.csv'):
        with open(fname) as fin:
            lines = fin.readlines()  # element 0 is the old header, e.g. A,B,C...
        lines[0] = 'Apple,Orange,Lemon\n'
        with open(fname, 'w') as fout:
            fout.write(''.join(lines))  # was ''.join(readlines), a NameError
I would suggest using Python's tempfile module to create a temporary file with the changes in it; after they're made, it can simply be renamed to replace the original file. I would also use the csv module to read the original and write the updated version, because it is fast, well tested, and can handle many varieties of CSV.
The combination makes the task relatively easy:
import csv
import os
from pathlib import Path
from tempfile import NamedTemporaryFile

CSV_FOLDER = Path('/home/robert/Testing/D1')
NEW_HEADER = 'id,t,s,p,date,e'.split(',')

for filepath in CSV_FOLDER.glob('*.csv'):
    with open(filepath, 'r', newline='') as csv_file, \
         NamedTemporaryFile('w', newline='', dir=filepath.parent,
                            delete=False) as tmp_file:
        reader = csv.reader(csv_file)
        writer = csv.writer(tmp_file)
        next(reader)                 # Skip the old header.
        writer.writerow(NEW_HEADER)  # Write the replacement header.
        writer.writerows(reader)     # Copy the remaining rows of the original file.
    os.replace(tmp_file.name, filepath)  # Replace the original file with the updated version.

print('CSV files updated')

Python: convert multiple JSON files in a folder to CSV

I have a lot of JSON files in a folder and I want to convert them to CSV format. Should I use import glob? I am a novice; how can I modify my code?
# -*- coding: utf-8 -*-
import csv
import json
import sys
import codecs

def trans(path):
    jsonData = codecs.open('C:/Users/jeri/Desktop/1', '*.json', 'r', 'utf-8')
    # csvfile = open(path+'.csv', 'w')
    # csvfile = open(path+'.csv', 'wb')
    csvfile = open('C:/Users/jeri/Desktop/1.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(csvfile, delimiter=',')
    flag = True
    for line in jsonData:
        dic = json.loads(line)
        if flag:
            keys = list(dic.keys())
            print(keys)
            flag = False
        writer.writerow(list(dic.values()))
    jsonData.close()
    csvfile.close()

if __name__ == '__main__':
    path = str(sys.argv[0])
    print(path)
    trans(path)
Yes, using glob would be a good way to iterate through the .json files in your folder! But glob doesn't have anything to do with reading or writing files. After importing glob, you can use it like this:
import glob

for curr_file in glob.glob("*.json"):
    ...  # process each file here
I see that you've used the json module to read the files in your code snippet. I'd say the better way to go about it is to use pandas:
df = pd.read_json(curr_file)
I say this because with the pandas library, you can simply convert from .json to .csv using
df.to_csv('file_name.csv')
Combining the three together, it would look like this:
import glob
import pandas as pd

for curr_file in glob.glob("*.json"):
    df = pd.read_json(curr_file)
    # Derive the output name from the input; a fixed name would make
    # every iteration overwrite the same CSV.
    df.to_csv(curr_file.replace('.json', '.csv'))
Also, note that if your JSON has nested objects, it can't be directly converted to CSV; you'll have to decide how to organize the data before the conversion.
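If you do need to flatten one level of nesting, recent pandas versions provide pandas.json_normalize, which turns nested fields into dotted column names. A minimal sketch, with a made-up record:
import pandas as pd

# Hypothetical nested record, just for illustration.
record = {"name": "jeri", "address": {"city": "Paris", "zip": "75001"}}

df = pd.json_normalize(record)
print(list(df.columns))  # ['name', 'address.city', 'address.zip']
df.to_csv('flattened.csv', index=False)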


Apply GZIP compression to a CSV in Python Pandas

I am trying to write a dataframe to a gzipped csv in python pandas, using the following:
import pandas as pd
import datetime
import csv
import gzip

# Get data (with previous connection and script variables)
df = pd.read_sql_query(script, conn)

# Create today's date, to append to the file name
todaysdatestring = str(datetime.datetime.today().strftime('%Y%m%d'))
print(todaysdatestring)

# Create csv with gzip compression
df.to_csv('foo-%s.csv.gz' % todaysdatestring,
          sep='|',
          header=True,
          index=False,
          quoting=csv.QUOTE_ALL,
          compression='gzip',
          quotechar='"',
          doublequote=True,
          line_terminator='\n')
This just creates a csv called 'foo-YYYYMMDD.csv.gz', not an actual gzip archive.
I've also tried adding this:
# Turn the to_csv statement into a variable
d = df.to_csv('foo-%s.csv.gz' % todaysdatestring,
              sep='|',
              header=True,
              index=False,
              quoting=csv.QUOTE_ALL,
              compression='gzip',
              quotechar='"',
              doublequote=True,
              line_terminator='\n')

# Write the above variable to gzip
with gzip.open('foo-%s.csv.gz' % todaysdatestring, 'wb') as output:
    output.write(d)
Which fails as well. Any ideas?
Using df.to_csv() with the keyword argument compression='gzip' should produce a gzip archive. I tested it using the same keyword arguments as you, and it worked.
You may need to upgrade pandas, as gzip support was not implemented until version 0.17.1; trying to use it on earlier versions will not raise an error, and will just produce a regular csv. You can determine your current version of pandas by looking at the output of pd.__version__.
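A quick way to check both things at once (a small sketch; check.csv.gz is a made-up name): print the version, then read the output back through the gzip module, which errors out on a plain-text file:
import gzip
import pandas as pd

print(pd.__version__)  # to_csv needs >= 0.17.1 for compression='gzip'

df = pd.DataFrame({'a': [1, 2]})
df.to_csv('check.csv.gz', compression='gzip', index=False)

# gzip.open fails on non-gzip data, so a successful read confirms
# the file really is a gzip archive.
with gzip.open('check.csv.gz', 'rt') as f:
    print(f.read())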
It is done very easily with pandas
import pandas as pd
Write a pandas dataframe to disk as a gzip-compressed csv:
df.to_csv('dfsavename.csv.gz', compression='gzip')
Read it back from disk:
df = pd.read_csv('dfsavename.csv.gz', compression='gzip')
From the gzip documentation:
import gzip

content = b"Lots of content here"  # gzip opened in binary mode expects bytes
with gzip.open('file.txt.gz', 'wb') as f:
    f.write(content)
And with pandas:
import csv
import gzip

content = df.to_csv(sep='|',
                    header=True,
                    index=False,
                    quoting=csv.QUOTE_ALL,
                    quotechar='"',
                    doublequote=True,
                    line_terminator='\n')

# to_csv returns a str here, so open the gzip file in text mode ('wt').
with gzip.open('foo-%s.csv.gz' % todaysdatestring, 'wt') as f:
    f.write(content)
The trick here is that to_csv returns the output as text if you don't pass it a filename; you then redirect that text to gzip's write method.
with gzip.open('foo-%s.csv.gz' % todaysdatestring, 'wt') as f:
    f.write(df.to_csv(sep='|', index=False, quoting=csv.QUOTE_ALL))

Add file name as last column of CSV file

I have a Python script which modifies a CSV file to add the filename as the last column:
import sys
import glob

for filename in glob.glob(sys.argv[1]):
    file = open(filename)
    data = [line.rstrip() + "," + filename for line in file]
    file.close()
    file = open(filename, "w")
    file.write("\n".join(data))
    file.close()
Unfortunately, it also adds the filename to the header (first) row of the file. I would like the string "ID" added to the header instead. Can anybody suggest how I could do this?
Have a look at the official csv module.
Here are a few minor notes on your current code:
It's a bad idea to use file as a variable name, since that shadows the built-in type.
You can close the file objects automatically by using the with syntax.
Don't you want to add an extra column in the header line, called something like Filename, rather than just omitting a column in the first row?
If your filenames have commas (or, less probably, newlines) in them, you'll need to make sure that the filename is quoted - just appending it won't do (see the short illustration after the code below).
That last consideration would incline me to use the csv module instead, which will deal with the quoting and unquoting for you. For example, you could try something like the following code:
import glob
import csv
import sys

for filename in glob.glob(sys.argv[1]):
    data = []
    with open(filename, newline='') as finput:
        for i, row in enumerate(csv.reader(finput)):
            to_append = "Filename" if i == 0 else filename
            data.append(row + [to_append])
    with open(filename, 'w', newline='') as foutput:
        writer = csv.writer(foutput)
        for row in data:
            writer.writerow(row)
That may quote the data slightly differently from your input file, so you might want to play with the quoting options for csv.reader and csv.writer described in the documentation for the csv module.
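To make the earlier point about commas in filenames concrete, here is a tiny illustration (the filename is made up):
import csv
import io

filename = 'report, final.csv'  # hypothetical filename containing a comma

# Naive string appending corrupts the row: the comma splits the last field.
print('a,b,' + filename)          # a,b,report, final.csv  -> 4 fields, not 3

# csv.writer quotes the field, so it survives a round trip intact.
buf = io.StringIO()
csv.writer(buf).writerow(['a', 'b', filename])
print(buf.getvalue(), end='')     # a,b,"report, final.csv"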
As a further point, you might have good reasons for taking a glob as a parameter rather than just the files on the command line, but it's a bit surprising - you'll have to call your script as ./whatever.py '*.csv' rather than just ./whatever.py *.csv. Instead, you could just do:
for filename in sys.argv[1:]:
... and let the shell expand your glob before the script knows anything about it.
One last thing - the current approach you're taking is slightly dangerous, in that if anything fails when writing back to the same filename, you'll lose data. The standard way of avoiding this is to instead write to a temporary file, and, if that was successful, rename the temporary file over the original. So, you might rewrite the whole thing as:
import csv
import sys
import tempfile
import shutil

for filename in sys.argv[1:]:
    tmp = tempfile.NamedTemporaryFile(delete=False)
    with open(filename, newline='') as finput:
        with open(tmp.name, 'w', newline='') as ftmp:
            writer = csv.writer(ftmp)
            for i, row in enumerate(csv.reader(finput)):
                to_append = "Filename" if i == 0 else filename
                writer.writerow(row + [to_append])
    shutil.move(tmp.name, filename)
You can try reading the header line first and tagging it with ",ID", then handling the remaining lines as before:
data = [file.readline().rstrip() + ",ID"]
data += [line.rstrip() + "," + filename for line in file]
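For context, here is that change dropped into the original script (a sketch; the in-place-rewrite caveats from the earlier answer still apply):
import sys
import glob

for filename in glob.glob(sys.argv[1]):
    with open(filename) as f:
        # First line is the header: tag it with the literal column name "ID".
        data = [f.readline().rstrip() + ",ID"]
        # Remaining lines are data rows: tag them with the filename.
        data += [line.rstrip() + "," + filename for line in f]
    with open(filename, "w") as f:
        f.write("\n".join(data))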
You can try changing your code, but using the csv module is recommended. This should give you the result you want:
import sys
import glob
import csv

filename = glob.glob(sys.argv[1])[0]
with open(filename, 'r', newline='') as f:  # 'rw' is not a valid file mode
    csv_output = []
    for row in csv.reader(f):
        if csv_output:  # data row: append the filename
            row.append(filename)
        else:           # header row: append the "ID" column name
            row.append("ID")
        csv_output.append(row)

with open(filename, 'w', newline='') as f:
    csv.writer(f, delimiter=',').writerows(csv_output)
Use the CSV module that comes with Python.
import csv
import sys

def process_file(filename):
    # Read the contents of the file into a list of lines.
    with open(filename, 'r', newline='') as f:
        contents = f.readlines()

    # Use a CSV reader to parse the contents.
    reader = csv.reader(contents)

    # Open the output and create a CSV writer for it.
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)

        # Process the header.
        header = next(reader)
        header.append('ID')
        writer.writerow(header)

        # Process each row of the body.
        for row in reader:
            row.append(filename)
            writer.writerow(row)

# Run the function on all command-line arguments. Note that this does no
# checking for things such as file existence or permissions.
for filename in sys.argv[1:]:
    process_file(filename)
You can run this as follows:
blair#blair-eeepc:~$ python csv_add_filename.py file1.csv file2.csv
You can use fileinput to do in-place editing:
import sys
import glob
import fileinput

for filename in glob.glob(sys.argv[1]):
    fi = fileinput.FileInput(filename, inplace=True)
    for line in fi:
        if fi.lineno() == 1:
            print(line.rstrip() + ",ID")  # header row gets the new column name
        else:
            print(line.rstrip() + "," + filename)
    fi.close()
