Open multiple Excel files to separate Pandas dataframes - python

Brand new to Python and could use some help importing multiple Excel files to separate Pandas dataframes. I have successfully implemented the following code, but of course it imports everything into one frame. I would like to import them into df1, df2, df3, df4, df5, etc.
Anything helps, thank you!
import pandas as pd
import glob
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df = pd.DataFrame()
for f in filenames:
data = pd.read_excel(f, 'Sheet1')
df = df.append(data)
print(df)
print(number_of_files)
get_files()

The easiest way to do that is to use a list. Each element of the list is a dataframe
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df_list = []
for f in filenames:
data = pd.read_excel(f, 'Sheet1')
df_list.append(data)
print(df_list)
print(number_of_files)
return df_list
get_files()
You can then access your dataframes with df_list[0], df_list[1]...

Just as another option by Jezrael answer here https://stackoverflow.com/a/52074347/13160821 but modified for your code.
from os.path import basename
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df_list = {basename(f) : pd.read_excel(f, 'Sheet1') for f in filenames}
print(number_of_files)
return df_list
get_files()
Which can then be accessed by the filename eg. dfs['file_name1.xlsx'] or dfs['some_file.xlsx']. You can also do things like splitext to remove the xlsx from the key or use just part of the filename.

Related

Concat dataframes with same names from multiple folders using pandas

I have three folders folder1, folder2, and folder3. They have data frames as follows:
folder1/
df1.csv
df4.csv
df5.csv
folder2/
df1.csv
df3.csv
df4.csv
folder3/
df4.csv
I am confused about how to contact the data frames using pandas.concat() with the same names in all three folders and save them in a new folder "finalfolder" such that the finalfolder contains concatinated files:
finalfolder/
df1.csv (concat from folder1 and folder2)
df3.csv (From folder 2)
df4.csv (concat from folder1, 2, and 3)
df5.csv (From folder 1)
edit to first answer:
from os import listdir
import pandas as pd
folder_paths = ['put all the folder paths here']
df_dict = {'folder': [], 'file': []}
for folder_path in folder_paths:
for file in listdir(folder_path):
df_dict['folder'].append(folder_path)
df_dict['file'].append(file)
df = pd.DataFrame(df_dict)
for file, group in df.groupby(df['file']):
df_temp = pd.DataFrame()
for folder in group['folder'].tolist():
df_temp = pd.concat([df_temp, pd.read_csv(f'{folder}/{file}')])
df_temp.to_csv(f'finalfolder/{file}')
this should do the trick, just make the list with your folders and it should do what you want.
import os
folders_list = ['folder1','folder2','folder3']
files1 = os.listdir(folders_list[0])
files2 = os.listdir(folders_list[1])
files3 = os.listdir(folders_list[2])
max_files_size =3
for i in range(max_files_size):
f1=False
f2=False
f3=False
try:
folder1_df = pd.read_csv(os.path.abspath("folder1")+"/"+files1[i])
f1=True
except:
pass
try:
folder2_df = pd.read_csv(os.path.abspath("folder2")+"/"+files2[i])
f2=True
except:
pass
try:
folder3_df = pd.read_csv(os.path.abspath("folder3")+"/"+files3[i])
f3=True
except:
pass
if f1 and f2 and f3:
final_df = pd.concat([folder1_df,folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f1 and f3:
final_df = pd.concat([folder1_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f2 and f3:
final_df = pd.concat([folder2_df,folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f1 and f2:
final_df = pd.concat([folder1_df,folder2_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f1:
final_df = pd.concat([folder1_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files1[i], index=None)
print(final_df.shape)
elif f2:
final_df = pd.concat([folder2_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files2[i], index=None)
print(final_df.shape)
elif f3:
final_df = pd.concat([folder3_df])
final_df.to_csv(os.path.abspath("final_folder")+"/"+files3[i], index=None)
print(final_df.shape)
else:
print("No")
A bit complicated but it should work if you are working on windows. Provide all folders names in list. max_files_size is maximum number of files any folder can have. You must have final_folder in which you want to save new files.
Because all folders does not have same number of files Try block will handle that. If will check files with same names in all folders, concat them and save in new folder. os.path.abspath get absolute path to files
import os
import csv as cs
import pandas as pd
base = os.path.abspath('/home/hari/Documents/python/pandas/') #base
directory
where the program saved
print(os.path.join(base, 'dir1/df1.csv'))
q = os.path.join(base, 'dir1/df1.csv')
w = os.path.join(base, 'dir1/df4.csv')
e = os.path.join(base, 'dir1/df5.csv')
r = os.path.join(base, 'dir2/df1.csv')
t = os.path.join(base, 'dir2/df3.csv')
y = os.path.join(base, 'dir2/df4.csv')
u = os.path.join(base, 'dir3/df4.csv')
csv = [q, w, e, r, t, y, u]
fi = pd.concat(map(pd.read_csv, csv), ignore_index=True) #used map
function because of array
print(fi) #for testing
final = open(os.path.join(base, 'final/final.csv'), 'w', encoding='UTF8')
write = cs.writer(final) #passing the final file in csv writer
write.writerow(fi) #passing the concatenated csv's in writer
#make sure to change the url in base directory

Add folder name to exported file

I'm hoping someone can assist. I want to add the folder name to a file export so the exported filename is "combined_summary_of .xls" but can't seem to be able to add in the right reference name. The list of folders does work but stuck at the folder name.
import os
import glob
import pandas as pd
df_list = list() # list of dataframes
folder = r"D:/summary_tables/"
os.chdir(folder)
for root, dirs, files in os.walk(folder):
for folder in folder:
keyword = folder
os.chdir("D:/summary_tables/")
glob.glob("D:/summary_tables/"+ keyword + "/filtered*.xls")
#initialize a empty dataframe and append individual files
all_data = pd.DataFrame()
for f in glob.glob("D:/summary_tables/" +keyword + "/filtered*.xls"):
df = pd.read_excel(f)
all_data = all_data.append(df,ignore_index=True)
all_data.head()
#group all of the files together and sort
all_data2 = pd.concat([all_data]).groupby(['host_name_queried']).sum().reset_index()
all_data2 = all_data2.sort_values('Total_count', ascending=False)
all_data2.head(n=10)
all_data2['Total_nx_domain'] = all_data2['Total_nx_domain'].astype(float)
#send to xls
import openpyxl
all_data2.to_excel('D:/summary_tables/combined_summary_of_' + '.xls', index=False)
print ("file has been saved")
all_data

How to Import Multiple excel file in PandasDataframe

I cannot load multiple excel files from a directory in only one Dataframe.
I have tried two different ways and both do no work.
Gives me this error.
How can I solve the problem? It does find the files when creates the list, but than cannot open it in the Dataframe.
Any hints ?
import pandas as pd
import os
import glob
import xlrd
cwd = os.getcwd()
cwd
path = '/Users/giovanni/Desktop/news media'
files = os.listdir(path)
files
files_xls = [f for f in files if f[-3:] == 'lsx']
files_xls
df = pd.DataFrame()
for f in files_xls:
data = pd.read_excel(f)
df = df.append(data)
FileNotFoundError: [Errno 2] No such file or directory: 'NOV.xlsx'
Try this:
import os
import glob
path = '/Users/giovanni/Desktop/news media'
df = pd.DataFrame()
for file in glob.glob(os.path.join(path,'*.xlsx')):
data = pd.read_excel(file)
print(data)
df = df.append(data)
Replace your final loop with:
for f in files_xls:
full_path = os.path.join(path, f)
data = pd.read_excel(full_path)
df = df.append(data)

Python , get duplicates in 1st column of all csv files in a directory

import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv',sep =
',',delimiter = None) # select 1 file in the directory
datasets_cols = ['transactionID','gvkey','companyName']
df= dataset.transactionID
df.shape
df.loc[df.duplicated()]
returns the duplicates in the selected file. displays row number and transactionID. so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
return(df_result)
here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
for file in file_list:
dataset = pd.read_csv(file)
df = dataset.transactionID
duplicated = df.loc[df.duplicated()]
if duplicated.empty == False:
print(file)
print(duplicated)
Have a look at the glob module.
import pandas as pd
import glob
def your_function(file):
# put your df processing logic here
return df_result
Step 1 - Create list of files in directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
Step 2 - Loop through files in list
for file in file_list: # Loop files
df_result = your_function(file) # Put your logic into a separate function
new_filename = file.replace('.csv', '_processed.csv')
df_result.to_csv(new_filename, index = False)
Comment
In case you would have included your code showing your attempts to do this yourself, your question was answered within seconds.

Loop through directory and create data frame

I'm trying to create a data frame and then loop through a directory filled with csv files and add those to the data frame. I'm trying to use the following code:
df = []
for dirName, subdirList, fileList in os.walk(rootDir):
for fname in fileList:
df = pd.read_csv(fname)
Unfortunately I'm getting an error stating that "File CIN_2017 does not exist" (it does). Any insight into how to add all these csv files into a dataframe? There is a .DS_Store in there but everything else is just a csv. Thanks.
You can try another solution with glob for return file names, then loop in list comprehension and create list of DataFrames. last concate them to one big df:
import glob
files = glob.glob('files/*.csv')
df = pd.concat([pd.read_csv(fp) for fp in files], ignore_index=True)
It is same as:
import glob
files = glob.glob('files/*.csv')
dfs = []
for fp in files:
dfs.append(pd.read_csv(fp))
df = pd.concat(dfs, ignore_index=True)
import os
import pandas as pd
un_process_file = []
master_frame = pd.DataFrame(columns=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
for root, dirs, files in os.walk(os.getcwd()):
for file_path in files:
if file_path.endswith('.csv'):
try:
print file_path
file_name = os.path.join(root, file_path)
file_frames = pd.read_csv(file_name, skiprows=2,
usecols=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
master_frame = master_frame.append(file_frames)
except:
un_process_file.append(file_path)
master_frame = master_frame.rename(
columns={'item_sku': 'sku', 'external_product_id': 'asin', 'standard_price': 'price'})
master_frame = master_frame.drop_duplicates(subset='asin')
master_frame.to_csv('masterfile.txt', sep='\t')
if un_process_file:
print '\nUnable To Process these files\n'
for files in un_process_file:
print files
I have a similar problem. I made this solution. Modify columns name according to you need

Categories