Reading multiple CSVs and saving into separate dataframes - Python

I'm looking to read multiple CSV files and then save each one into a separately named dataframe.
import os
import glob
import pandas as pd

path = os.getcwd()
csv_files = glob.glob(os.path.join(r'blabla/Data', "*.csv"))
for f in csv_files:
    df = pd.read_csv(f)
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])
    print('Content:')
    df.pop('Unnamed: 0')
    display(df)
    print()
When I call display(df) inside the loop, it displays all three tables from the three CSV files in that folder. However, when I print df outside the loop, it only gives me the last table that was read. How do I save each table from the CSV files into a separate dataframe?

It seems like you're overwriting the same variable again and again.
path = os.getcwd()
csv_files = glob.glob(os.path.join(r'blabla/Data', "*.csv"))
list_of_dfs = []
for f in csv_files:
    df = pd.read_csv(f)
    print('Location:', f)
    print('File Name:', f.split("\\")[-1])
    print('Content:')
    df.pop('Unnamed: 0')
    display(df)
    list_of_dfs.append(df)
Access the individual dataframes with list_of_dfs[0], list_of_dfs[1], and so on.
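If you would rather look the frames up by name than by position, a dictionary keyed by each file's base name works just as well. A minimal sketch of that variant (the 'blabla/Data' path is carried over from the question):
import os
import glob
import pandas as pd

csv_files = glob.glob(os.path.join(r'blabla/Data', "*.csv"))

# key each dataframe by its file name without the .csv extension
dfs_by_name = {}
for f in csv_files:
    name = os.path.splitext(os.path.basename(f))[0]
    dfs_by_name[name] = pd.read_csv(f)

# e.g. dfs_by_name['echo'] if the folder contains echo.csv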

Related

Extract single column from multiple CSVs and save to new CSV

I would like to read out a specific column from over 100 CSV files to create a new CSV file. Each source column's header will be renamed to the name of the file the column was extracted from.
I can get the individual columns, but I have been unable to rename each column's header without the ".csv" extension:
import os
import pandas as pd

folder = "C:/Users/Doc/Data"
E2080 = []
with os.scandir(folder) as files:
    for file in files:
        #print(file)
        df = pd.read_csv(file, index_col=None)
        dist = {file: (df['lnt_dist'])}
        E = pd.DataFrame(dist)
        E2080.append(E)
dist = pd.concat(E2080, ignore_index=False, axis=1)
dist.head()
dist.to_csv('E2080', index=False)
This is the final code that worked for me:
E2080 = []
with os.scandir(folder) as files:
    for file in files:
        #print(file)
        df = pd.read_csv(file, index_col=None)
        dist = {file: (df['lnt_dist'])}
        E = pd.DataFrame(dist)
        # rename the header, dropping the .csv extension and the
        # <DirEntry> wrapper that os.scandir returns
        E_1 = E.rename(columns={file: file.name.split('.')[0]})
        E2080.append(E_1)
dist = pd.concat(E2080, ignore_index=False, axis=1)
#dist.head()
dist.to_csv('E2080.csv', index=False)
You should use file.name instead of file to get the name as a string.
With the string, you can then use .split(".") to get the name without the extension.
for file in os.scandir(folder):
    print(file.name, '=>', file.name.split(".")[0])
Or you could use pathlib.Path instead of os.scandir(), which offers more built-in functionality:
import pathlib

for file in pathlib.Path('test').iterdir():
    print(file.name, '=>', file.stem)
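Putting the two hints together, the whole extraction loop can be written with pathlib alone. A sketch under the same assumptions as the question (the folder path, the lnt_dist column, and the E2080.csv output name come from the original post):
import pathlib
import pandas as pd

folder = pathlib.Path("C:/Users/Doc/Data")

# one single-column frame per file, headed by the file's stem (name minus extension)
columns = [
    pd.DataFrame({path.stem: pd.read_csv(path)['lnt_dist']})
    for path in sorted(folder.glob('*.csv'))
]

dist = pd.concat(columns, axis=1)
dist.to_csv('E2080.csv', index=False)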

Creating one unique file from many others saved in a folder

I have a list of CSV files (approx. 100) that I'd like to combine into one single CSV file.
The list is found using
PATH_DATA_FOLDER = 'mypath/'
list_files = os.listdir(PATH_DATA_FOLDER)

for f in list_files:
    list_columns = list(pd.read_csv(os.path.join(PATH_DATA_FOLDER, f)).columns)
    df = pd.DataFrame(columns=list_columns)
    print(df)
Which returns the files (just a sample, since I have more than 100):
['file1.csv', 'name2.csv', 'example.csv', '.DS_Store']
Unfortunately, this also includes hidden files, which I'd like to exclude.
Each file has the same columns:
Columns: [Name, Surname, Country]
I'd like to find a way to create one unique file with all these fields, plus information of the original file (e.g., adding a new column with the file name).
I've tried with
df1 = pd.read_csv(os.path.join(PATH_DATA_FOLDER, f))
df1['File'] = f  # file name
df = df.append(df1)
df = df.reset_index(drop=True).drop_duplicates()  # I'd like to drop duplicates on both Name and Surname
but it returns a dataframe with only the last entry, so I guess the problem is in the for loop.
I hope you can provide some help.
import glob
import pandas as pd

extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames])

# drop duplicates and reset index (the result must be assigned back)
combined_csv = combined_csv.drop_duplicates().reset_index(drop=True)

# save the combined file
combined_csv.to_csv("combined_csv.csv", index=False, encoding='utf-8-sig')
Have you tried using glob?
filenames = glob.glob("mypath/*.csv")  # list of all your csv files
df = pd.DataFrame(columns=["Name", "Surname", "Country"])
for filename in filenames:
    # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
    df = df.append(pd.read_csv(filename))
df = df.drop_duplicates().reset_index(drop=True)
Another way would be to concatenate the CSV files with the cat command after removing the headers, and then read the concatenated file with pd.read_csv.
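Neither answer above adds the source-file column the question asks for. A hedged sketch of one way to get it, tagging each row via assign() while concatenating (the 'File' column name and the Name/Surname duplicate check follow the asker's own attempt):
import os
import glob
import pandas as pd

PATH_DATA_FOLDER = 'mypath/'
filenames = glob.glob(os.path.join(PATH_DATA_FOLDER, '*.csv'))  # skips hidden files like .DS_Store

combined = pd.concat(
    # assign() adds a 'File' column recording where each row came from
    [pd.read_csv(f).assign(File=os.path.basename(f)) for f in filenames],
    ignore_index=True,
)
combined = combined.drop_duplicates(subset=['Name', 'Surname']).reset_index(drop=True)
combined.to_csv('combined.csv', index=False)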

Read multiple CSV files then rename files based on the filenames

Currently, the code below reads all the CSV files in the path and saves them in a list.
I want to save each dataframe under the name of its file, e.g. echo.csv.
import os
import glob
import pandas as pd

path = r'M:\Work\Experimental_datasets\device_ID\IoT_device_captures\packet_header_features'  # use your path
all_files = glob.glob(os.path.join(path, "*.csv"))

li = []
for filename in all_files:
    df = pd.read_csv(filename, skiprows=15, sep='[|]',
                     skipfooter=2, engine='python', header=None,
                     names=["sum_frame_len", "avg_frame_len", "max_frame_len", "sum_ip_len"],
                     usecols=[2, 3, 4, 5])
    li.append(df)
The output I get is a list of dataframes, but I want each dataframe stored under the name of its file, e.g. echo. How do I access each dataframe from a dictionary?
As you mentioned, a dictionary would be useful for this task. For example:
import os
import glob
import pandas as pd

all_files = glob.glob(os.path.join(path, "*.csv"))

df_dict = {}
for filename in all_files:
    df = pd.read_csv(filename, skiprows=15, sep='[|]',
                     skipfooter=2, engine='python', header=None,
                     names=["sum_frame_len", "avg_frame_len", "max_frame_len", "sum_ip_len"],
                     usecols=[2, 3, 4, 5])
    # strip the directory and the .csv extension, e.g. 'echo'
    name = os.path.basename(filename).split('.')[0]
    df_dict[name] = df
What you will be left with is the dictionary df_dict, where each key is the name of a file and the value is the data within that file.
You can view all the keys with df_dict.keys() and select a given DataFrame with df_dict[key].
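For instance, a quick way to inspect what was loaded (a sketch; the 'echo' key assumes a file named echo.csv was in the folder):
# list every loaded frame and its shape
for name, frame in df_dict.items():
    print(name, frame.shape)

# pull out a single dataframe by file name
echo_df = df_dict['echo']  # assumes echo.csv existed in the folder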

Read multiple excel files to multiple variables (variable name from the file itself)

I have multiple excel files:
import os

files = os.listdir()

# list excel files in the folder
files_xlsx = [f for f in files if f[-4:] == 'xlsx']

# sort
files_xlsx.sort()

# remove the extension
for i in range(len(files_xlsx)):
    files_xlsx[i] = files_xlsx[i][:-5]

files_xlsx
['Microsoft_Excel_Worksheet',
'Microsoft_Excel_Worksheet1',
'slide2_chart_rId3_object_rId1',
'slide3_chart_rId2_object_rId1',
'slide3_chart_rId3_object_rId1',
'slide4_chart_rId2_object_rId1',
'slide4_chart_rId3_object_rId1',
'slide5_chart_rId3_object_rId1',
'slide6_chart_rId2_object_rId1']
I'd like to read the files using pandas and save each dataframe to a variable:
import pandas as pd

# ??? how to loop this ???
Microsoft_Excel_Worksheet = pd.read_excel(files_xlsx[0] + '.xlsx', index_col='Unnamed: 0')
Microsoft_Excel_Worksheet1 = pd.read_excel(files_xlsx[1] + '.xlsx', index_col='Unnamed: 0')
slide2_chart_rId3_object_rId1 = pd.read_excel(files_xlsx[2] + '.xlsx', index_col='Unnamed: 0')
I don't know how to loop the procedure. Thanks in advance for the help!
What you can do is read the xlsx files into dataframes and append them to a combined list:
basepath = <basepath>
files = list(filter(lambda x: '.xlsx' in x, os.listdir(basepath)))

alldf = []
for f in files:
    df = pd.read_excel(f"{basepath}/{f}", index_col='Unnamed: 0')
    alldf.append(df)
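The question actually asks for one variable per file; rather than generating variable names dynamically, a dictionary keyed by the file name (minus the extension) gets the same effect. A minimal sketch under the same basepath assumption as above:
import os
import pandas as pd

# basepath and files are assumed to be set as in the snippet above
dfs = {}
for f in files:
    name = os.path.splitext(f)[0]  # e.g. 'Microsoft_Excel_Worksheet'
    dfs[name] = pd.read_excel(f"{basepath}/{f}", index_col='Unnamed: 0')

# access as dfs['Microsoft_Excel_Worksheet'] instead of a bare variable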

Reading in a list of files into a list of DataFrames

I'm trying to read a list of files into a list of Pandas DataFrames in Python. However, the code below doesn't work.
files = [file1, file2, file3]
df1 = pd.DataFrame()
df2 = pd.DataFrame()
df3 = pd.DataFrame()
dfs = [df1, df2, df3]

# Read in data files
for file, df in zip(files, dfs):
    if file_exists(file):
        with open(file, 'rb') as in_file:
            df = pd.read_csv(in_file, low_memory=False)
            print(df)  # the file is getting read properly

print(df1)  # empty
print(df2)  # empty
print(df3)  # empty
How do I get the original DataFrames to update if I pass them into a for-loop as a list of DataFrames?
Try this:
dfs = [pd.read_csv(f, low_memory=False) for f in files]
If you want to check whether each file exists:
import os
dfs = [pd.read_csv(f, low_memory=False) for f in files if os.path.isfile(f)]
and if you want to concatenate all of them into one data frame:
df = pd.concat([pd.read_csv(f, low_memory=False)
                for f in files if os.path.isfile(f)],
               ignore_index=True)
When you iterate, you only rebind the loop variable to a new object; you never operate on the list itself, so its elements stay empty.
You need to assign the new dataframes back into the list. One possibility:
files = [file1, file2, file3]
dfs = [None] * 3  # just a placeholder

# Read in data files
for i, file in enumerate(files):  # enumeration instead of zip
    if file_exists(file):
        with open(file, 'rb') as in_file:
            dfs[i] = pd.read_csv(in_file, low_memory=False)  # setting the list element
            print(dfs[i])  # the file is getting read properly
This updates the list elements and should work.
Your code seems overcomplicated; you can just do:
files = [file1, file2, file3]
dfs = []

# Read in data files
for file in files:
    if file_exists(file):
        dfs.append(pd.read_csv(file, low_memory=False))
You will end up with a list of dfs, as desired.
You can try list comprehension:
files = [file1, file2, file3]
dfs = [pd.read_csv(x, low_memory=False) for x in files if file_exists(x)]
Here is a custom-written Python function that handles both CSV and JSON files:
def generate_list_of_dfs(incoming_files):
    """
    Accepts a list of csv and json file/path names.
    Returns a list of DataFrames.
    """
    outgoing_files = []
    for filename in incoming_files:
        file_extension = filename.split('.')[1]
        if file_extension == 'json':
            with open(filename, mode='r') as incoming_file:
                outgoing_json = pd.DataFrame(json.load(incoming_file))
            outgoing_files.append(outgoing_json)
        if file_extension == 'csv':
            outgoing_csv = pd.read_csv(filename)
            outgoing_files.append(outgoing_csv)
    return outgoing_files
How to Call this Function
import pandas as pd
import json
files_to_be_read = ['filename1.json', 'filename2.csv', 'filename3.json', 'filename4.csv']
dataframes_list = generate_list_of_dfs(files_to_be_read)
Here is a simple solution that avoids using a list to hold all the data frames, if you don't need them in a list.
import os
import fnmatch

# get the CSV files only
files = fnmatch.filter(os.listdir('.'), '*.csv')
files
Output, which is now a list of the names:
['Feedback Form Submissions 1.21-1.25.22.csv',
'Feedback Form Submissions 1.21.22.csv',
'Feedback Form Submissions 1.25-1.31.22.csv']
Now create a simple list of new names to make working with them easier:
# use a simple format
names = []
for i in range(0, len(files)):
    names.append('data' + str(i))
names
['data0', 'data1', 'data2']
You can use any list of names that you want. The next step takes the file names and the list of names, and assigns each file's contents to the corresponding name.
# i is the incrementor for the list of names
i = 0
# iterate through the file names
for file in files:
    # load the file into a dataframe
    df = pd.read_csv(file, low_memory=False)
    # get the next name from the list, this will be a string
    new_name = names[i]
    # bind the dataframe to the variable named by that string
    locals()[new_name] = df.copy()
    # increment the index into the list of names
    i = i + 1
You now have three separate dataframes named data0, data1, and data2, and can run commands like
data2.info()
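One caveat worth knowing: CPython does not guarantee that writes to locals() take effect inside a function (the approach above only works reliably at module scope), so a plain dictionary is the safer variant. A minimal sketch, reusing files and pd from the snippet above:
# safer alternative: a dict keyed by the same generated names
frames = {}
for i, file in enumerate(files):
    frames['data' + str(i)] = pd.read_csv(file, low_memory=False)

frames['data2'].info()  # equivalent to data2.info() above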
