Loop through multiple CSV files and run a script - python

I have a script which pulls data from a CSV file, does some manipulation on it, and creates an output Excel file. But it's a tedious process, as I need to do it for multiple files.
Question: Is there a way to run this script across multiple CSV files at once and create a separate Excel output file for each input file?
I'm not sure what to try here. I've read that I need to use a module called glob, but I'm not sure how to go about it.
This script works for a single file:
# Import libraries
import pandas as pd
import numpy as np   # pd.np was a private alias removed in pandas 1.0; use numpy directly
import xlsxwriter

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
INPUT_FILE = 'rawData.csv'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'
OUTPUT_FILE = 'rawDataOutput.xlsx'

# Get data
df = pd.read_csv(INPUT_PATH + INPUT_FILE)

# Clean data
cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                  'Orders','Sales(INR)','NTB orders','NTB sales']]
cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                                                                       ascending=False).reset_index()
cleanedData.loc['Total'] = cleanedData.select_dtypes(np.number).sum()
cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                         cleanedData['Impressions']).astype(float).map("{:.2%}".format)
cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                          cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                  cleanedData['Orders']).astype(float).map("{:.2%}".format)
cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                 cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                           'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                           'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]

# Create summary
summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(np.number).sum()
summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

# Push to excel
writer = pd.ExcelWriter(OUTPUT_PATH + OUTPUT_FILE, engine='xlsxwriter')
summaryData.to_excel(writer, sheet_name='Summary')
cleanedData.to_excel(writer, sheet_name='Overall Report')
writer.save()
I've never tried anything like this before, and I'd appreciate your help figuring this out.

You can use Python's glob.glob() to get all of the CSV files from a given folder. For each filename that is returned, you could derive a suitable output filename. The file processing could be moved into a function as follows:
# Import libraries
import pandas as pd
import numpy as np   # pd.np was a private alias removed in pandas 1.0; use numpy directly
import xlsxwriter
import glob
import os

def process_csv(input_filename, output_filename):
    # Get data
    df = pd.read_csv(input_filename)

    # Clean data
    cleanedData = df[['State','Campaigns','Type','Start date','Impressions','Clicks','Spend(INR)',
                      'Orders','Sales(INR)','NTB orders','NTB sales']]
    cleanedData = cleanedData[cleanedData['Impressions'] != 0].sort_values('Impressions',
                                                                           ascending=False).reset_index()
    cleanedData.loc['Total'] = cleanedData.select_dtypes(np.number).sum()
    cleanedData['CTR(%)'] = (cleanedData['Clicks'] /
                             cleanedData['Impressions']).astype(float).map("{:.2%}".format)
    cleanedData['CPC(INR)'] = (cleanedData['Spend(INR)'] / cleanedData['Clicks'])
    cleanedData['ACOS(%)'] = (cleanedData['Spend(INR)'] /
                              cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData['% of orders NTB'] = (cleanedData['NTB orders'] /
                                      cleanedData['Orders']).astype(float).map("{:.2%}".format)
    cleanedData['% of sales NTB'] = (cleanedData['NTB sales'] /
                                     cleanedData['Sales(INR)']).astype(float).map("{:.2%}".format)
    cleanedData = cleanedData[['State','Campaigns','Type','Start date','Impressions','Clicks','CTR(%)',
                               'Spend(INR)','CPC(INR)','Orders','Sales(INR)','ACOS(%)',
                               'NTB orders','% of orders NTB','NTB sales','% of sales NTB']]

    # Create summary
    summaryData = cleanedData.groupby(['Type'])[['Spend(INR)','Sales(INR)']].agg('sum')
    summaryData.loc['Overall Snapshot'] = summaryData.select_dtypes(np.number).sum()
    summaryData['ROI'] = summaryData['Sales(INR)'] / summaryData['Spend(INR)']

    # Push to excel
    writer = pd.ExcelWriter(output_filename, engine='xlsxwriter')
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()

# Set system paths
INPUT_PATH = 'SystemPath//Downloads//'
OUTPUT_PATH = 'SystemPath//Downloads//Output//'

for csv_filename in glob.glob(os.path.join(INPUT_PATH, "*.csv")):
    name, ext = os.path.splitext(os.path.basename(csv_filename))
    # Create an output filename based on the input filename
    output_filename = os.path.join(OUTPUT_PATH, f"{name}Output.xlsx")
    process_csv(csv_filename, output_filename)
os.path.join() can be used as a safer way to join file paths together.
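If you prefer pathlib, the same loop can be written without manual path joining; a minimal sketch, reusing the process_csv() function defined above:
from pathlib import Path

input_dir = Path('SystemPath/Downloads')
output_dir = Path('SystemPath/Downloads/Output')
output_dir.mkdir(parents=True, exist_ok=True)  # make sure the output folder exists

for csv_path in input_dir.glob('*.csv'):
    # Path.stem is the filename without its extension
    output_path = output_dir / f"{csv_path.stem}Output.xlsx"
    process_csv(str(csv_path), str(output_path))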

Something like:
import os
import glob
import pandas as pd

os.chdir(r'path\to\folder')    # change folder path to working dir
filelist = glob.glob('*.csv')  # create a list of all csv files
for file in filelist:          # loop through the files
    df = pd.read_csv(file, ...)
    # Do something and create a final_df
    final_df.to_excel(file[:-4] + '_output.xlsx', index=False)  # excel with same name + output
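Slicing with file[:-4] only works for a four-character '.csv' suffix; os.path.splitext() is a safer way to build the output name. A small variant of the last line:
import os

name, _ = os.path.splitext(file)  # 'report.csv' -> ('report', '.csv')
final_df.to_excel(name + '_output.xlsx', index=False)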

You can run this script inside a for loop:
for file in os.listdir(INPUT_PATH):
    if file.endswith('.csv') or file.endswith('.CSV'):
        INPUT_FILE = INPUT_PATH + '/' + file
        OUTPUT_FILE = INPUT_PATH + '/Outputs/' + file[:-4] + '.xlsx'
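For completeness, a sketch of how the read/clean/write steps from the question might slot into that loop (clean() is a hypothetical stand-in for the cleaning code above):
import os
import pandas as pd

for file in os.listdir(INPUT_PATH):
    if file.lower().endswith('.csv'):  # covers .csv and .CSV
        input_file = os.path.join(INPUT_PATH, file)
        output_file = os.path.join(INPUT_PATH, 'Outputs', file[:-4] + '.xlsx')
        df = pd.read_csv(input_file)
        cleaned = clean(df)  # hypothetical: the cleaning steps from the question
        cleaned.to_excel(output_file, index=False)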

Try this:
import glob

files = glob.glob(INPUT_PATH + "*.csv")
for file in files:
    # Get data
    df = pd.read_csv(file)
    # Clean data
    # your cleaning code
    # Push to excel
    writer = pd.ExcelWriter(OUTPUT_PATH + file.split("/")[-1].replace(".csv", "_OUTPUT.xlsx"),
                            engine='xlsxwriter')
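The snippet stops after creating the writer; assuming the summaryData and cleanedData frames from the question, the loop body would finish along these lines:
    summaryData.to_excel(writer, sheet_name='Summary')
    cleanedData.to_excel(writer, sheet_name='Overall Report')
    writer.save()
Note that os.path.basename(file) would be more portable than file.split("/")[-1], since glob returns backslash-separated paths on Windows.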

Related

Outputting or reading columns in a different order using pandas/python

I have a Python script that reads a CSV file and outputs it to HTML.
I would like to change the order of the columns once written to the HTML file, but I don't know if I should do this on read or on write, or what function to use. Here is my code so far:
import pandas as pd
import os
import shutil
import glob

# paths
HTMLPATH = "C:/NMS4/QUE/HTML/"
QUEPATH = "C:/NMS4/QUE/"

# create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

# convert .txt file to a html file in the HTML folder
# can't convert an empty file so only convert if file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129],
                          names=['OrderNo','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        html_table = csv.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
Any help, greatly appreciated.
Thanks.
I've been looking at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_html.html, but can't quite find out how to re-order the columns.
Edit: the changes to get the solution are:
import pandas as pd
import os
import shutil
import glob

# paths
HTMLPATH = "C:/NMS4/QUE/HTML/"
QUEPATH = "C:/NMS4/QUE/"

# create the directory for holding the HTML files if it doesn't exist
isExist = os.path.exists(HTMLPATH)
if not isExist:
    os.mkdir(HTMLPATH, 0o777)

# convert .txt file to a html file in the HTML folder
# python can't convert an empty file so only convert if file size is not 0
for quefile in glob.iglob('C:/NMS4/QUE/*.txt'):
    if os.path.getsize(quefile) != 0:
        csv = pd.read_csv(quefile, header=None, usecols=[0,3,4,15,34,43,44,129],
                          names=['Order No','Req Qty','Planned Start','Resource','Op','Part','Desc','Qty Recd'])
        cols = list(csv.columns)
        a, b, c, d, e, f, g = (cols.index('Order No'), cols.index('Req Qty'), cols.index('Planned Start'),
                               cols.index('Resource'), cols.index('Op'), cols.index('Part'), cols.index('Desc'))
        cols[a], cols[b], cols[c], cols[d], cols[e], cols[f], cols[g] = \
            cols[a], cols[e], cols[f], cols[g], cols[c], cols[b], cols[d]
        df = csv[cols]
        html_table = df.to_html()
        f = open(quefile + '.html', 'w')
        f.write(html_table)
        f.close()
        shutil.move(quefile + ".HTML", HTMLPATH)
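As a side note, pandas can reorder columns by indexing the frame with a list in the desired order, which avoids the index bookkeeping; a sketch with the same column names (this list reproduces the order the swaps above produce):
order = ['Order No','Op','Part','Desc','Planned Start','Req Qty','Resource','Qty Recd']
df = csv[order]  # columns now appear in exactly this order
html_table = df.to_html()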

Add folder name to exported file

I'm hoping someone can assist. I want to add the folder name to a file export, so the exported filename is "combined_summary_of .xls", but I can't seem to add in the right reference name. The list of folders does work, but I'm stuck at the folder name.
import os
import glob
import pandas as pd

df_list = list()  # list of dataframes
folder = r"D:/summary_tables/"
os.chdir(folder)

for root, dirs, files in os.walk(folder):
    for folder in folder:
        keyword = folder
        os.chdir("D:/summary_tables/")
        glob.glob("D:/summary_tables/" + keyword + "/filtered*.xls")

        # initialize an empty dataframe and append individual files
        all_data = pd.DataFrame()
        for f in glob.glob("D:/summary_tables/" + keyword + "/filtered*.xls"):
            df = pd.read_excel(f)
            all_data = all_data.append(df, ignore_index=True)
        all_data.head()

# group all of the files together and sort
all_data2 = pd.concat([all_data]).groupby(['host_name_queried']).sum().reset_index()
all_data2 = all_data2.sort_values('Total_count', ascending=False)
all_data2.head(n=10)
all_data2['Total_nx_domain'] = all_data2['Total_nx_domain'].astype(float)

# send to xls
import openpyxl
all_data2.to_excel('D:/summary_tables/combined_summary_of_' + '.xls', index=False)
print("file has been saved")
all_data
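One way to get the folder name into the export is to loop over the subfolder names directly (note that "for folder in folder:" iterates over the characters of the string, which is why the right name never appears). A minimal sketch, assuming each subfolder of D:/summary_tables holds the filtered*.xls files:
import os
import glob
import pandas as pd

base = r"D:/summary_tables"

for entry in os.listdir(base):
    subfolder = os.path.join(base, entry)
    if not os.path.isdir(subfolder):
        continue
    frames = [pd.read_excel(f) for f in glob.glob(os.path.join(subfolder, "filtered*.xls"))]
    if not frames:
        continue
    combined = pd.concat(frames, ignore_index=True)
    # reuse the subfolder's name in the output filename
    out_name = os.path.join(base, "combined_summary_of_{}.xls".format(entry))
    combined.to_excel(out_name, index=False)  # .xls output needs the xlwt engine on older pandas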

Reading multiple excel files and writing them to multiple excel files in python

I have written code that reads an Excel file and, after processing it with the required function, writes it to an Excel file. I have done this for one Excel file. Now my question: when I want to do it for multiple Excel files, that is, read multiple Excel files and produce multiple output files, how do I apply a for loop here so I get a separate output Excel file for each input file?
Following is my code:
from ParallelP import *
import time, json
import pandas as pd

if __name__ == '__main__':
    __ip__ = "ip/"
    __op__ = "op/"
    __excel_file_name__ = __ip__ + '80chars.xlsx'
    __prediction_op__ = __op__ + basename(__excel_file_name__) + "_processed.xlsx"

    df = pd.read_excel(__excel_file_name__)
    start_time = time.time()
    df_preprocessed = run(df)
    print("Time Needed to execute all data is {0} seconds".format((time.time() - start_time)))
    print("Done...")
    df_preprocessed.to_excel(__prediction_op__)
I tried to stick to your example and just expand it as I would do it. The below example is untested and does not mean that it is the best way to do it!
from ParallelP import *
import time, json
import pandas as pd
import os
from pathlib import Path  # Handles directory paths -> less error prone than manually sticking together paths

if __name__ == '__main__':
    __ip__ = "ip/"
    __op__ = "op/"

    # Get a list of all excel files in a given directory
    excel_file_list = [f for f in os.listdir(__ip__) if f.endswith('.xlsx')]

    # Loop over the list and process each excel file separately
    for excel_file in excel_file_list:
        excel_file_path = Path(__ip__, excel_file)  # Create the file path
        df = pd.read_excel(str(excel_file_path))    # Read the excel file to a data frame

        start_time = time.time()
        df_preprocessed = run(df)  # Run your routine
        print("Time Needed to execute all data is {0} seconds".format((time.time() - start_time)))
        print("Done...")

        # Create the output file name
        prediction_output_file_name = '{}__processed.xlsx'.format(str(excel_file_path.resolve().stem))
        # Create the output file path
        prediction_output_file_path = str(Path(__op__, prediction_output_file_name))
        # Write the output to the excel file
        df_preprocessed.to_excel(prediction_output_file_path)
Sidenote: I have to mention that your variable names feel like a misuse of the __. These 'dunder' names are special and indicate that they have a meaning for Python (see for example here). Please just name your variables input_dir and output_dir instead of __ip__ and __op__, respectively.
I do have some code I wrote. Maybe you can alter this for your requirements.
import os

# This is where your input file should be
in_folder = 'input/xls/file/folder'
# This will be your output folder
out_folder = 'output/xls/file/folder'

if not os.path.exists(out_folder):
    os.makedirs(out_folder)

file_exist = False
dir_list = os.listdir(in_folder)

for xlfile in dir_list:
    if xlfile.endswith('.xlsx') or xlfile.endswith('.xls'):
        file_exist = True
        str_file = os.path.join(in_folder, xlfile)
        #work_book = load_workbook(filename=str_file)
        #work_sheet = work_book['qa']
        # Do your work here with excel
        #out_Path = os.path.join(out_folder,)
        # and output it to the out_Path

if not file_exist:
    print('cannot find any valid excel file in the folder ' + in_folder)
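Filling in the commented skeleton, the openpyxl calls it hints at might look like this (the 'qa' sheet name comes from the commented lines; the output naming is just illustrative):
from openpyxl import load_workbook

work_book = load_workbook(filename=str_file)
work_sheet = work_book['qa']
# ... modify work_sheet here ...
out_path = os.path.join(out_folder, xlfile)  # illustrative: same name, output folder
work_book.save(out_path)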

Taking Same Worksheet from a Folder of xlsm Files with Python

I'm new to pandas/python and I've come up with the following code to extract data from a specific part of a worksheet.
import openpyxl as xl
import pandas as pd

rows_with_data = [34,37,38,39,44,45,46,47,48,49,50,54,55,57,58,59,60,62,63,64,65,66,
                  70,71,72,76,77,78,79,80,81,82,83,84,88,89,90,91,92]

path = r'XXX'
xpath = input('XXX')
file = r'**.xlsm'
xfile = input('Change file name, current is ' + file + ' :')
sheetname = r'Summary'

wb = xl.load_workbook(filename=xpath + '\\' + file, data_only=True)
sheet = wb.get_sheet_by_name(sheetname)
rows = len(rows_with_data)

line_items = []
for i in range(rows):
    line_items.append(sheet.cell(row=rows_with_data[i], column=13).value)

period = []
for col in range(17, 35):
    period.append(sheet.cell(row=20, column=col).value)
print(line_items)

vals = []
x = []
for i in range(rows):
    if i != 0:
        vals.append(x)
        x = []
    for col in range(17, 35):
        x.append(sheet.cell(row=rows_with_data[i], column=col).value)
vals.append(x)

all_values = {}
all_values['Period'] = period
for i in range(rows):
    print(line_items[i])
    all_values[line_items[i]] = vals[i]
print(all_values)

period_review = input('Enter a period (i.e. 2002): ')
item = input('Enter a period (i.e. XXX): ')
time = period.index(period_review)
display_item = str(all_values[item][time])
print(item + ' for ' + period_review + " is " + display_item)

Summary_Dataframe = pd.DataFrame(all_values)
writer = pd.ExcelWriter(xpath + '\\' + 'values.xlsx')
Summary_Dataframe.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
I have the same worksheet (summary results) across a library of 60 xlsm files, and I'm having a hard time figuring out how to iterate this across the entire folder of files. I also want to change this from extracting specific rows to taking the entire "Summary" worksheet, pasting it into the new file, and naming the worksheet by its filename ("Experiment_A") when pasted into the new Excel file. Any advice?
I was having a hard time reading your code to understand what you finally want to do, so this is just advice, not a solution. You can iterate through all the files in the folder using os, read the files into one dataframe, then save the single big dataframe to CSV. I usually avoid Excel, but I guess you need the Excel conversion. In the example below I read all the txt files from a directory, put them into a list of dataframes, then stored the big dataframe as JSON. You can also store it as Excel/CSV.
import os
import pandas as pd

def process_data():
    # input file path in 2 parts in case it is very long
    input_path_1 = r'\\path\to\the\folder'
    input_path_2 = r'\second\part\of\the\path'
    # adding the full file path
    file_path = input_path_1 + input_path_2
    # listing all files in the folder
    file_list = os.listdir(os.path.join(file_path))
    # selecting only the .txt files into a list object
    file_list = [file_name for file_name in file_list if '.txt' in file_name]
    # selecting the fields we need ('sent_date' added so the transformation below works)
    field_names = ['country', 'ticket_id', 'sent_date']
    # defining a list to put all the dataframes in
    pd_list = []
    inserted_files = []
    # looping over txt files and storing them in the list
    for file_name in file_list:
        # creating the file path to read the file
        file_path_ = file_path + '\\' + file_name
        df_ = pd.read_csv(os.path.join(file_path_), sep='\t', usecols=field_names)
        # converting the datetime to date
        # a few internal data transformations before writing
        df_['sent_date'] = pd.to_datetime(df_['sent_date'])
        df_['sent_date'] = df_['sent_date'].values.astype('datetime64[M]')
        # adding each dataframe to the list
        pd_list.append(df_)
        # adding file name to the inserted list to print later
        inserted_files.append(file_name)
    print(inserted_files)
    # SQL-like union of all dataframes to create a single data source
    df_ = pd.concat(pd_list)
    output_path_1 = r'\\path\to\output'
    output_path_2 = r'\path\to\output'
    output_path = output_path_1 + output_path_2
    # put the file name
    file_name = 'xyz.json'
    # adding the day the file was processed
    df_['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    # write file to json
    df_.to_json(os.path.join(output_path, file_name), orient='records')
    print('Data stored as json successfully')

process_data()
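Applied to the question itself (one 'Summary' sheet per xlsm file, copied into a single output workbook with each sheet named after its source file), a minimal sketch might look like this, assuming the files all sit in one folder:
import glob
import os
import pandas as pd

folder = r'XXX'  # folder containing the ~60 .xlsm files

with pd.ExcelWriter(os.path.join(folder, 'combined.xlsx')) as writer:
    for path in glob.glob(os.path.join(folder, '*.xlsm')):
        # read only the 'Summary' worksheet from each workbook
        df = pd.read_excel(path, sheet_name='Summary')
        # Excel caps sheet names at 31 characters
        sheet = os.path.splitext(os.path.basename(path))[0][:31]
        df.to_excel(writer, sheet_name=sheet, index=False)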

Python , get duplicates in 1st column of all csv files in a directory

import pandas as pd
import glob

# select 1 file in the directory
dataset = pd.read_csv('masterfeedproduction-EURNA_2016-06-27.csv', sep=',', delimiter=None)
datasets_cols = ['transactionID','gvkey','companyName']

df = dataset.transactionID
df.shape
df.loc[df.duplicated()]
This returns the duplicates in the selected file, displaying the row number and transactionID, so this is correct.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")
df_result = df.loc[df.duplicated()]

for file in file_list:
    return(df_result)
Here I am stuck.
target_directory = r'C:\Users\nikol\Downloads\fullDailyDeltas\fullDailyDeltas'
file_list = glob.glob(target_directory + "/*.csv")

for file in file_list:
    dataset = pd.read_csv(file)
    df = dataset.transactionID
    duplicated = df.loc[df.duplicated()]
    if not duplicated.empty:
        print(file)
        print(duplicated)
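To keep the results instead of just printing them, the per-file duplicates can be collected into a single frame; a small sketch reusing the loop above (the source_file column is just illustrative):
results = []
for file in file_list:
    df = pd.read_csv(file).transactionID
    duplicated = df.loc[df.duplicated()]
    if not duplicated.empty:
        results.append(duplicated.to_frame().assign(source_file=file))

all_duplicates = pd.concat(results, ignore_index=True) if results else pd.DataFrame()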
Have a look at the glob module.
import pandas as pd
import glob

def your_function(file):
    # put your df processing logic here
    return df_result

Step 1 - Create a list of files in the directory:
target_directory = r'Path/to/your/dir'
file_list = glob.glob(target_directory + "/*.csv")
# Include the slash or it will search in the wrong directory!!

Step 2 - Loop through the files in the list:
for file in file_list:  # Loop files
    df_result = your_function(file)  # Put your logic into a separate function
    new_filename = file.replace('.csv', '_processed.csv')
    df_result.to_csv(new_filename, index=False)
Comment: had you included code showing your attempts to do this yourself, your question would have been answered within seconds.
