Brand new to Python and could use some help importing multiple Excel files to separate Pandas dataframes. I have successfully implemented the following code, but of course it imports everything into one frame. I would like to import them into df1, df2, df3, df4, df5, etc.
Anything helps, thank you!
import pandas as pd
import glob
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df = pd.DataFrame()
for f in filenames:
data = pd.read_excel(f, 'Sheet1')
df = df.append(data)
print(df)
print(number_of_files)
get_files()
The easiest way to do that is to use a list. Each element of the list is a dataframe
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df_list = []
for f in filenames:
data = pd.read_excel(f, 'Sheet1')
df_list.append(data)
print(df_list)
print(number_of_files)
return df_list
get_files()
You can then access your dataframes with df_list[0], df_list[1]...
Just as another option by Jezrael answer here https://stackoverflow.com/a/52074347/13160821 but modified for your code.
from os.path import basename
def get_files():
directory_path = input('Enter directory path: ')
filenames = glob.glob(directory_path + '/*.xlsx')
number_of_files = len(filenames)
df_list = {basename(f) : pd.read_excel(f, 'Sheet1') for f in filenames}
print(number_of_files)
return df_list
get_files()
Which can then be accessed by the filename eg. dfs['file_name1.xlsx'] or dfs['some_file.xlsx']. You can also do things like splitext to remove the xlsx from the key or use just part of the filename.
I'm hoping someone can assist. I want to add the folder name to a file export so the exported filename is "combined_summary_of .xls" but can't seem to be able to add in the right reference name. The list of folders does work but stuck at the folder name.
import os
import glob
import pandas as pd
df_list = list() # list of dataframes
folder = r"D:/summary_tables/"
os.chdir(folder)
for root, dirs, files in os.walk(folder):
for folder in folder:
keyword = folder
os.chdir("D:/summary_tables/")
glob.glob("D:/summary_tables/"+ keyword + "/filtered*.xls")
#initialize a empty dataframe and append individual files
all_data = pd.DataFrame()
for f in glob.glob("D:/summary_tables/" +keyword + "/filtered*.xls"):
df = pd.read_excel(f)
all_data = all_data.append(df,ignore_index=True)
all_data.head()
#group all of the files together and sort
all_data2 = pd.concat([all_data]).groupby(['host_name_queried']).sum().reset_index()
all_data2 = all_data2.sort_values('Total_count', ascending=False)
all_data2.head(n=10)
all_data2['Total_nx_domain'] = all_data2['Total_nx_domain'].astype(float)
#send to xls
import openpyxl
all_data2.to_excel('D:/summary_tables/combined_summary_of_' + '.xls', index=False)
print ("file has been saved")
all_data
I have a script which pulls in data from a csv file, does some manipulations to it and creates an output excel file. But, its a tedious process as I need to do it for multiple files.
Question: Is there a way for me to run this script across multiple csv files together and create a separate excel file output for each input file?
I'm not sure what to try out here. I've read that I need to use a module called glob but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import xlsxwriter
# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'
# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)
# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
ascending= False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]
# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']
# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before and I would appreciate your help trying to figure this out
You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import xlsxwriter
import glob
import os
def process_csv(input_filename, output_filename):
# Get data
df = pd.read_csv(input_filename)
# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
ascending= False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(pd.np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]
# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(pd.np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']
# Push to excel
writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
name, ext = os.path.splitext(os.path.basename(csv_filename))
# Create an output filename based on the input filename
output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
Something like:
import os
import glob
import pandas as pd
os.chdir(r'path\to\folder') #changes folder path to working dir
filelist=glob.glob('*.csv') #creates a list of all csv files
for file in filelist: #loops through the files
df=pd.read_csv(file,...)
#Do something and create a final_df
final_df.to_excel(file[:-4],+'_output.xlsx',index=False) #excel with same name+ouput
you can run this scrip inside a for loop:
for file in os.listdir(INPUT_PATH):
if file.endswith('.csv') or file.endswith('.CSV'):
INPUT_FILE = INPUT_PATH + '/' + file
OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file.[:-4] + 'xlsx'
try this:
import glob
files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
# Get data
df = pd.read_csv(file)
# Clean data
#your cleaning code
# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv","_OUTPUT.xlxs", engine='xlsxwriter')
import pandas as pd
import glob
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv',sep =
',',delimiter = None) # select 1 file in the directory
datasets_cols = ['transactionID','gvkey','companyName']
df= dataset.transactionID
df.shape
df.loc[df.duplicated()]
returns the duplicates in the selected file. displays row number and transactionID. so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]
for file in file_list:
return(df_result)
here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
for file in file_list:
dataset = pd.read_csv(file)
df = dataset.transactionID
duplicated = df.loc[df.duplicated()]
if duplicated.empty == False:
print(file)
print(duplicated)
Have a look at the glob module.
import pandas as pd
import glob
def your_function(file):
# put your df processing logic here
return df_result
Step 1 - Create list of files in directory
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include slash or it will search in the wrong directory!!
Step 2 - Loop through files in list
for file in file_list: # Loop files
df_result = your_function(file) # Put your logic into a separate function
new_filename = file.replace('.csv', '_processed.csv')
df_result.to_csv(new_filename, index = False)
Comment
In case you would have included your code showing your attempts to do this yourself, your question was answered within seconds.
I'm trying to create a data frame and then loop through a directory filled with csv files and add those to the data frame. I'm trying to use the following code:
df = []
for dirName, subdirList, fileList in os.walk(rootDir):
for fname in fileList:
df = pd.read_csv(fname)
Unfortunately I'm getting an error stating that "File CIN_2017 does not exist" (it does). Any insight into how to add all these csv files into a dataframe? There is a .DS_Store in there but everything else is just a csv. Thanks.
You can try another solution with glob for return file names, then loop in list comprehension and create list of DataFrames. last concate them to one big df:
import glob
files = glob.glob('files/*.csv')
df = pd.concat([pd.read_csv(fp) for fp in files], ignore_index=True)
It is same as:
import glob
files = glob.glob('files/*.csv')
dfs = []
for fp in files:
dfs.append(pd.read_csv(fp))
df = pd.concat(dfs, ignore_index=True)
import os
import pandas as pd
un_process_file = []
master_frame = pd.DataFrame(columns=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
for root, dirs, files in os.walk(os.getcwd()):
for file_path in files:
if file_path.endswith('.csv'):
try:
print file_path
file_name = os.path.join(root, file_path)
file_frames = pd.read_csv(file_name, skiprows=2,
usecols=['item_sku', 'external_product_id', 'standard_price', 'quantity'])
master_frame = master_frame.append(file_frames)
except:
un_process_file.append(file_path)
master_frame = master_frame.rename(
columns={'item_sku': 'sku', 'external_product_id': 'asin', 'standard_price': 'price'})
master_frame = master_frame.drop_duplicates(subset='asin')
master_frame.to_csv('masterfile.txt', sep='\t')
if un_process_file:
print '\nUnable To Process these files\n'
for files in un_process_file:
print files
I have a similar problem. I made this solution. Modify columns name according to you need