Loop for converting multiple files from csv.gz to csv - python

I have several csv.gz files I am trying to convert and save as csv files. I'm able to do it for an individual file using:
with gzip.open('Pool.csv.gz') as f:
    Pool = pd.read_csv(f)
Pool.to_csv("Pool.csv")
I'm trying to create a loop to convert all the files in the directory, but I'm failing. Here is my code:
import gzip
import glob
import os
os.chdir('/home/path')
extension = 'csv.gz'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
for i in range(len(all_filenames)): gzip.open(i) as f:
    pool_1 = pd.read_csv(f)

Your loop iterates over range(len(all_filenames)), so i is an integer index rather than a filename, and gzip.open(i) as f: is missing the with keyword. You can use os.listdir() to create your list of files and then loop through it:
import os
import gzip
import pandas as pd

dir_path = "/home/path"
# collect every file ending in .csv.gz in the directory
all_files = [f for f in os.listdir(dir_path) if f.endswith('csv.gz')]
for file in all_files:
    with gzip.open(f"{dir_path}/{file}") as f:
        df = pd.read_csv(f)
    # "Pool.csv.gz" is written back out as "Pool.csv"
    df.to_csv(f"{file.split('.')[0]}.csv")

Related

All Headers get put into one column after CSV to Excel Conversion with Python?

So I have multiple CSVs which I combined using this script:
import os
import glob
import pandas as pd

# set working directory
os.chdir("Path to CSVs")
# find all csv files in the folder
# use glob pattern matching -> extension = 'csv'
# save result in list -> all_filenames
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])
# export to csv
combined_csv.to_csv("mz_all.csv", index=False, encoding='utf-8-sig')
What I want to do is convert it into an xlsx file, which I already did, BUT all of the headers from the csv get put into one column, which looks like an absolute mess. The code for the conversion looks like this:
# Reading the csv file
df_new = pd.read_csv('mz_all.csv')
# saving xlsx file
GFG = pd.ExcelWriter('MZ_EXCEL.xlsx')
df_new.to_excel(GFG, index=False)
GFG.save()
Here's how the Excel file looks at the moment. As you can see, all the headers got pushed into the first column, but I want it to be organized like it was in the csv.
Have you tried saving it to Excel directly, like this? Note the sep=';' in read_csv: if the source CSVs are semicolon-delimited, reading them with the default comma separator is exactly what crams every header into a single column.
import os
import glob
import pandas as pd

# set working directory
os.chdir("Path to CSVs")
# find all csv files in the folder
# use glob pattern matching -> extension = 'csv'
# save result in list -> all_filenames
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
# combine all files in the list, reading with ';' as the separator
combined_csv = pd.concat([pd.read_csv(f, sep=';') for f in all_filenames])
# write straight to xlsx, no intermediate csv needed
combined_csv.to_excel('MZ_EXCEL.xlsx', index=False)
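If you are not sure which delimiter a given csv uses, pandas can sniff it: passing sep=None together with the Python parser engine makes read_csv detect the separator via csv.Sniffer. A small sketch (the file name here is just a placeholder):

import pandas as pd

# sep=None + engine='python' auto-detects ',' vs ';' and similar delimiters
df = pd.read_csv('some_export.csv', sep=None, engine='python')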

How to import a combined CSV file as a dataset under the subgroup /ds1?

I am trying to collect all CSV files into one HDF5 file and import them as a dataset under the subgroup ds1. I tried the following code, but I don't get what I want:
import h5py
import numpy.random
import os
import glob
import pandas as pd
os.chdir("/root/Desktop/file/data/dataset/ds1")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')
#Create a HDF5 file
xxx3 = "xxx3.h5py";
xxx3 = h5py.File(xxx3, "w");
file=pd.HDFStore('/root/Desktop/file/data/dataset/ds1','w')
IR= xxx3.create_group("/root/Desktop/file/data/dataset/ds1");
XAFS = xxx3.create_group("/root/Desktop/file/data/dataset/ds2");
combined_csv.csv=pd.read_csv('/root/Desktop/file/data/dataset/ds1combined_csv.csv')
file.put('combined_csv.csv',combined_csv.csv,format='table',data_columns=True)
xxx3.close()
I noticed you only create group objects in the H5 file xxx3 in the code above (with xxx3.create_group()).
This is what your code would look like if you read the data with NumPy and loaded it into HDF5 with h5py. The np.genfromtxt() arguments depend on the contents of your CSV, so you may need to adjust them for your data.
# Create an HDF5 file and load the combined csv as dataset /ds1
import h5py
import numpy as np

xxx3 = h5py.File("xxx3.h5py", "w")
# names=True turns the header row into field names of a structured array
rec_arr = np.genfromtxt("/root/Desktop/file/data/dataset/ds1combined_csv.csv",
                        delimiter=',', names=True, encoding=None)
IR = xxx3.create_dataset("/ds1", data=rec_arr)
xxx3.close()
There's another example here:
SO 55576601
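Since combined_csv already exists as a pandas DataFrame in the question's code, a possible alternative (my suggestion, not the original approach) is pandas' own HDF5 support, which writes through PyTables. This sketch assumes the tables package is installed and uses an .h5 file name of my choosing:

import pandas as pd

combined_csv = pd.read_csv("combined_csv.csv")
# key='/ds1' stores the DataFrame under the ds1 node of the HDF5 file
combined_csv.to_hdf("xxx3.h5", key="/ds1", format="table", data_columns=True)

Note that a DataFrame stored this way appears in the file as a PyTables group of datasets rather than a single plain dataset, so read it back with pd.read_hdf rather than raw h5py.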

Import and append pickle files

How could I import and append all files in a directory?
files = os.listdir(r"C:\Users\arv\Desktop\pickle_files")
data = []
for i in files:
    data.append(pd.read_pickle(i))
df = pd.concat(['data'])
Almost like you tried to do it yourself; pd.concat needs the list of DataFrames, not the string 'data':
df = pd.concat([pd.read_pickle(f) for f in files])
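One caveat worth adding (my note, not part of the original answer): os.listdir() returns bare file names, so pd.read_pickle(f) only works when the script runs inside that directory. Joining the directory onto each name avoids that:

import os
import pandas as pd

pickle_dir = r"C:\Users\arv\Desktop\pickle_files"
# build absolute paths so this works from any working directory
files = [os.path.join(pickle_dir, f) for f in os.listdir(pickle_dir)]
df = pd.concat([pd.read_pickle(f) for f in files], ignore_index=True)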

How can I add filename of imported txt files to dataframe in python

I have imported a few thousand txt files from a folder into a pandas dataframe. Is there any way I can add a column containing a sub-string of each imported txt file's name, to identify each text file in the dataframe by a unique name?
Text files are named 1001example.txt, 1002example.txt, 1003example.txt and so on. I want something like this:
filename text
1001 this is an example text
1002 this is another example text
1003 this is the last example text
....
The code I have used to import the data is below; however, I do not know how to create a column from a sub-string of the filenames. Any help would be appreciated. Thanks.
import glob
import os
import pandas as pd

file_list = glob.glob(os.path.join(os.getcwd(), "K:\\text_all", "*.txt"))
corpus = []
for file_path in file_list:
    with open(file_path, encoding="latin-1") as f_input:
        corpus.append(f_input.read())
df = pd.DataFrame({'text': corpus})
This should work. It takes the numbers from the file name:
import glob
import os
import pandas as pd

file_list = glob.glob(os.path.join(os.getcwd(), "K:\\text_all", "*.txt"))
corpus = []
files = []
for file_path in file_list:
    with open(file_path, encoding="latin-1") as f_input:
        corpus.append(f_input.read())
    # keep only the digits from the file name, e.g. "1001" from "1001example.txt"
    files.append(''.join([n for n in os.path.basename(file_path) if n.isdigit()]))
df = pd.DataFrame({'file': files, 'text': corpus})
There is a one-liner (note that it uses pd.read_csv, so it assumes the txt files parse as CSV):
df = pd.concat([pd.read_csv(f, encoding='latin-1')
                  .assign(Filename=os.path.basename(f))
                for f in glob.glob('K:\\text_all\\*.txt')])
df['Filename'] = df['Filename'].str.extract(r'(\d+)').astype(int)
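For free-form text files, a pathlib rewrite of the first answer may be easier to follow; this sketch is my own phrasing of the same idea, with the directory and encoding taken from the question:

from pathlib import Path
import pandas as pd

rows = []
for path in Path(r"K:\text_all").glob("*.txt"):
    # "1001example" -> "1001"
    digits = ''.join(ch for ch in path.stem if ch.isdigit())
    rows.append({'filename': digits, 'text': path.read_text(encoding='latin-1')})
df = pd.DataFrame(rows)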

Read multiple csv files zipped in one file

I have several csv files in several zip files in one folder, for example:
A.zip (contains csv1,csv2,csv3)
B.zip (contains csv4, csv5, csv6)
which are in the folder path C:/Folder/. When I load normal csv files from a folder, I use the following code:
import glob
import pandas as pd
files = glob.glob("C:/folder/*.csv")
dfs = [pd.read_csv(f, header=None, sep=";") for f in files]
df = pd.concat(dfs, ignore_index=True)
I followed this post: Reading csv zipped files in python.
Reading one csv inside a zip works like this:
import pandas as pd
import zipfile
zf = zipfile.ZipFile('C:/Users/Desktop/THEZIPFILE.zip')
df = pd.read_csv(zf.open('intfile.csv'))
Any idea how to optimize this loop for me?
Use ZipFile.namelist() to get the list of files inside the zip.
Ex:
import glob
import zipfile
import pandas as pd

for zip_file in glob.glob("C:/folder/*.zip"):
    zf = zipfile.ZipFile(zip_file)
    # one DataFrame per csv inside this archive
    dfs = [pd.read_csv(zf.open(f), header=None, sep=";") for f in zf.namelist()]
    df = pd.concat(dfs, ignore_index=True)
    print(df)
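Note that df in the loop above is rebuilt for every archive, so after the loop it only holds the last zip's data. If you want one DataFrame covering every csv in every zip, accumulate as you go (my extension of the answer, same assumed path and separator):

import glob
import zipfile
import pandas as pd

all_dfs = []
for zip_file in glob.glob("C:/folder/*.zip"):
    with zipfile.ZipFile(zip_file) as zf:
        # collect every member csv from every archive
        all_dfs.extend(pd.read_csv(zf.open(f), header=None, sep=";") for f in zf.namelist())
df = pd.concat(all_dfs, ignore_index=True)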
I would try to tackle it in two passes. First pass: extract the contents of the zip files onto the filesystem. Second pass: read all those extracted CSVs using the method you already have above:
import glob
import pandas as pd
import zipfile

def extract_files(file_path, dest="unzipped"):
    # ZipFile.extractall() returns None, so extract into a known folder
    # ("unzipped" here is an arbitrary choice) and glob that folder afterwards
    with zipfile.ZipFile(file_path, 'r') as archive:
        archive.extractall(dest)

for zf in glob.glob("C:/folder/*.zip"):
    extract_files(zf)

csv_files = glob.glob("unzipped/*.csv")
dfs = [pd.read_csv(f, header=None, sep=";") for f in csv_files]
df = pd.concat(dfs, ignore_index=True)
