Python Pandas ExcelWriter append to sheet creates a new sheet - python

I would really appreciate some help.
I'm trying to use a loop to create sheets and add data to those sheets on every iteration. The position of my data is correct; however, pandas' ExcelWriter creates a new sheet instead of appending to the one created the first time the loop runs.
I'm a beginner, and right now function comes before form, so forgive me.
My code:
import pandas as pd
# Input workbooks that are loaded into DataFrames
excel_file = 'output.xlsx'
setup_file = 'setup.xlsx'
# Workbook that the results are written to
output_filename = 'output_final.xlsx'
df = pd.read_excel(excel_file)  # create DataFrame from the data workbook
# Normalise the column titles: trim, lower-case, spaces -> underscores, drop parentheses
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')',
                                                                                                        '')
df_setup = pd.read_excel(setup_file)
# Same title clean-up for the setup workbook so both frames share column names
df_setup.columns = df_setup.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')',
                                                                                                                    '')
df_2 = pd.merge(df, df_setup)  # merge data with setup to have the krymp size for each wire
# Build the wire label as '<cable>_<function_code>-<terminal_strip>:<terminal>, prefixed with an apostrophe.
# TODO: make the delimiters configurable via inputs.
df_2['wirelabel'] = "'" + df_2['cable'] + "_" + df_2['function_code'] + "-" + df_2['terminal_strip'] + ":" + df_2[
    'terminal']
# NOTE(review): the return value is not assigned; sort_values is not in-place by
# default, so df_2 presumably stays unsorted here — confirm.
df_2.sort_values(by=['switchboard'])
# Unique switchboards (read from df, not from the merged df_2); sorter() iterates over them
switchboard_unique = df.switchboard.unique().tolist()
def createsheets(output_filename, sheetname, row_start, column_start, df_towrite):
    """Write the 'wirelabel' column of ``df_towrite`` into a sheet of ``output_filename``.

    The workbook is opened in append mode with the openpyxl engine. The data is
    written without index or header, starting at ``row_start`` / ``column_start``
    of the sheet named ``sheetname``.
    """
    with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a') as writer:
        df_towrite.to_excel(writer, sheet_name=sheetname, columns=['wirelabel'], startrow=row_start, startcol=column_start, index=False, header=False)
        # NOTE(review): the with-statement already finalises the writer on exit,
        # so these two calls look redundant — confirm before removing.
        writer.save()
        writer.close()
def sorter():
    """Call createsheets() once per krymp value of every switchboard.

    Reads the module-level ``switchboard_unique``, ``df_2`` and
    ``output_filename``; each call targets the next column to the right.
    """
    for s in switchboard_unique:
        # Rows of the merged frame that belong to this switchboard
        df_3 = df_2.loc[df_2['switchboard'] == s]
        krymp_unique = df_3.krymp.unique().tolist()
        krymp_unique.sort()
        # print(krymp_unique)
        column_start = 0
        row_start = 0
        for k in krymp_unique:
            # NOTE(review): the filtered result is not assigned, so the full df_3
            # is what gets passed to createsheets() below — confirm intent.
            df_3.loc[df_3['krymp'] == k]
            # print(k)
            # print(s)
            # print(df_3['wirelabel'])
            createsheets(output_filename, s, row_start, column_start, df_3)
            column_start = column_start + 1
sorter()
current behavior:
If the sheet name is "sheet", my script creates sheet1, sheet2, sheet3, etc.
pictureofcurrent
Wanted behavior
Create a sheet for each item in "df_3", and put data into columns according to the position calculated in column_start. The position in my code works, just goes to the wrong sheet.
pictureofwanted
I hope it's clear what I'm trying to accomplish, and all help is appreciated.
I tried all the example code I have found regarding writing to Excel.

I know my code is not a work of art, but I will update this post with the answer to my own question for the sake of completeness, in case anyone stumbles on this post.
It turns out I misunderstood the capabilities of the "append" mode in pandas' "pd.ExcelWriter". It is not possible to append to a sheet that already exists; the sheet gets overwritten even though mode is set to 'a'.
Realizing this, I changed my code to build a dataframe for the entire sheet (df_sheet) and then call the "createsheets" function in my code. The first version wrote my data column by column.
"Final" code:
import pandas as pd
# Input workbooks that are loaded into DataFrames
excel_file = 'output.xlsx'
setup_file = 'setup.xlsx'
# Workbook that receives one sheet per switchboard
output_filename = 'output_final.xlsx'


def _clean_columns(frame):
    """Normalise the column titles of ``frame`` in place and return it.

    Titles are stripped, lower-cased, spaces become underscores and
    parentheses are removed.
    """
    titles = frame.columns.str.strip().str.lower()
    for old, new in ((' ', '_'), ('(', ''), (')', '')):
        # regex=False: '(' and ')' are regex metacharacters, so request a plain
        # literal replacement instead of relying on the pandas default.
        titles = titles.str.replace(old, new, regex=False)
    frame.columns = titles
    return frame


df = _clean_columns(pd.read_excel(excel_file))
df_setup = _clean_columns(pd.read_excel(setup_file))
# Merge data with setup to have the krymp size for each wire
df_2 = pd.merge(df, df_setup)
# Wire label: '<cable>_<function_code>-<terminal_strip>:<terminal>, prefixed with an apostrophe.
# TODO: make the delimiters configurable via inputs.
df_2['wirelabel'] = (
    "'" + df_2['cable'] + "_" + df_2['function_code'] + "-"
    + df_2['terminal_strip'] + ":" + df_2['terminal']
)
# sort_values returns a new frame; assign it so the ordering actually applies
df_2 = df_2.sort_values(by=['switchboard'])
# Unique switchboards of the merged (and sorted) data, one Excel sheet each
switchboard_unique = df_2['switchboard'].unique().tolist()
def createsheets(output_filename, sheetname, df_towrite):
    """Write ``df_towrite`` (with header, without index) to sheet ``sheetname``.

    The workbook ``output_filename`` is opened in append mode when it already
    exists so earlier sheets are kept; otherwise it is created.
    """
    import os

    # Append mode requires an existing workbook; the very first call has none.
    mode = 'a' if os.path.exists(output_filename) else 'w'
    with pd.ExcelWriter(output_filename, engine='openpyxl', mode=mode) as writer:
        df_towrite.to_excel(writer, sheet_name=sheetname, index=False, header=True)
def to_csv_file(output_filename, df_towrite):
    """Write ``df_towrite`` to the CSV file ``output_filename`` without the index.

    An existing file is overwritten (mode 'w').
    """
    df_towrite.to_csv(output_filename, mode='w', index=False)
def sorter():
    """Build one sheet per switchboard and export it to Excel and CSV.

    For every switchboard in ``switchboard_unique`` the wire labels in ``df_2``
    are split by krymp size; each size becomes one column named
    "krymp <size> Tavle: <switchboard>". The resulting frame is written to a
    sheet of ``output_filename`` and to "<switchboard>.csv".
    """
    for s in switchboard_unique:
        df_3 = df_2.loc[df_2['switchboard'] == s]
        columns = []
        for k in sorted(df_3['krymp'].unique().tolist()):
            labels = df_3.loc[df_3['krymp'] == k, 'wirelabel']
            column_name = "krymp " + str(k) + " Tavle: " + str(s)
            # Reset the index so every column starts at row 0 when placed side by side
            columns.append(labels.reset_index(drop=True).rename(column_name))
        # pd.concat rejects an empty list, so fall back to an empty sheet
        df_sheet = pd.concat(columns, axis=1) if columns else pd.DataFrame([])
        createsheets(output_filename, s, df_sheet)
        # f-string so a non-string switchboard value still yields a file name
        to_csv_file(f"{s}.csv", df_sheet)


sorter()
Thank you.

Related

Iterate over Excel sheets, clean them up and concatenate

The below code will iterate through all Sheets, change them and concatenate.
import pandas as pd
# sheet_name=None loads every sheet into a dict of {sheet name: DataFrame}
sheets_dict = pd.read_excel('Royalties Jan to Dec 21.xlsx', sheet_name=None)
all_sheets = []
for name, sheet in sheets_dict.items():
    sheet['sheet'] = name  # remember which sheet each row came from
    sheet = sheet.fillna('')
    # Combine rows 2 and 3 into the column labels.
    # NOTE(review): the combined labels are presumably not unique (e.g. several
    # blank header cells all become ' '), which would explain the
    # InvalidIndexError raised later — confirm against the data.
    sheet.columns = (sheet.iloc[2] + ' ' + sheet.iloc[3])
    # Keep only the rows whose first column is 'TOTAL'
    sheet = sheet[sheet.iloc[:,0] == 'TOTAL']
    all_sheets.append(sheet)
full_table = pd.concat(all_sheets)
full_table.reset_index(inplace=True, drop=True)
full_table.to_excel('output.xlsx')
However, when I execute the code, I get the following error:
InvalidIndexError: Reindexing only valid with uniquely valued Index objects
I have pinpointed the issue to the following line:
sheet.columns = (sheet.iloc[2] + ' ' + sheet.iloc[3])
This line is supposed to merge two rows together:
Would anyone know what am I doing wrong? Thank you.

Storing data from a for loop in excel python

I want to store all the data from the for loop in an Excel file; currently only the last output is stored:
import pandas
import openpyxl
outputFile = 'outputData.xlsx'
# NOTE(review): os, pd, pyperclip and ExcelWriter are used below but their
# imports are not visible in this snippet — confirm.
workbook = openpyxl.load_workbook(os.getcwd() + '/sourceData.xlsx')
sheet = workbook["Sheet1"]
# Walk the worksheet from row 2 to the last used row
for i in range(2, sheet.max_row + 1):
    <I do some ops to copy the data>
    data = pyperclip.paste() # Want this data to be stored in the output Excel file; every cell has a different input, so the output differs too
    # NOTE(review): mapData is not defined in this snippet; presumably set by the elided operations
    df = pd.DataFrame({'Address':[mapData]})
    # df2 is re-created empty on every pass, so it only ever holds the current row
    df2 = pd.DataFrame()
    df2 = df2.append(df, ignore_index=True, sort=False)
writer = ExcelWriter(outputFile)
df2.to_excel(writer,'Sheet1',index=False)
writer.save()
Just move the first initialization of df2 out of the loop:
...
# Define df2 here, just once, so rows accumulate across iterations
df2 = pd.DataFrame()
for i in range(2, sheet.max_row + 1):
    ...  # your operations
    df = pd.DataFrame({'Address': [mapData]})
    # Add the new row right after df is generated. pd.concat is used because
    # DataFrame.append was removed in pandas 2.0.
    df2 = pd.concat([df2, df], ignore_index=True, sort=False)
# Save once after the loop; to_excel opens and closes the file itself, so no
# explicit writer.save() (also removed in pandas 2.0) is needed.
df2.to_excel(outputFile, sheet_name='Sheet1', index=False)

Error with combining multiple workbook and sheets

I wondered if anyone could help. I am combining multiple excel files that have between 1-3 sheets. I want to combine these sheets into 3 dataframes and have done so using the below code:
# One accumulator per sheet name
all_workbook1 = pd.DataFrame()
all_workbook2 = pd.DataFrame()
all_workbook3 = pd.DataFrame()
# Every .xlsx file in the current directory
for f in glob.glob("*.xlsx"):
    # Read columns B:AO of each sheet and tag the rows with "[<file name>]"
    dfworkbook1 = pd.read_excel(f, sheet_name="sheet1", usecols="B:AO")
    dfworkbook1["Filename"] = "[" + os.path.basename(f) + "]"
    all_workbook1 = all_workbook1.append(dfworkbook1,ignore_index=True)
    dfworkbook2 = pd.read_excel(f, sheet_name="sheet2", usecols="B:AO")
    dfworkbook2["Filename"] = "[" + os.path.basename(f) + "]"
    all_workbook2 = all_workbook2.append(dfworkbook2,ignore_index=True)
    # NOTE(review): presumably this is where the "No sheet named" error is
    # raised for workbooks that lack 'sheet3' — confirm.
    dfworkbook3 = pd.read_excel(f, sheet_name="sheet3", usecols="B:AO")
    dfworkbook3["Filename"] = "[" + os.path.basename(f) + "]"
    all_workbook3 = all_workbook3.append(dfworkbook3,ignore_index=True)
When running this I can get the below error:
xlrd.biffh.XLRDError: No sheet named <'sheet3'>
I believe this is due to the fact that not all of my files have 'sheet3'. What is the best process to avoid this? I have tried to add code to the start that runs over the files and adds the missing sheets as a blank sheet but have been struggling with this.
Any help would be great.
Thanks,
Dan
Consider using a defined method that runs try/except to account for potentially missing sheets. Then call method within several list comprehensions for corresponding sheet data frame lists that are ultimately concatenated together:
def read_xl_data(file, sh):
    """Read columns B:AO of sheet ``sh`` from workbook ``file``.

    A "Filename" column holding "[<base name of file>]" is added to the result.
    Returns an empty DataFrame when the workbook has no sheet named ``sh``.
    Any other problem (unreadable or corrupt file) is raised instead of being
    silently turned into an empty frame.
    """
    import os

    with pd.ExcelFile(file) as workbook:
        # Check the sheet list up front rather than catching every exception
        if sh not in workbook.sheet_names:
            return pd.DataFrame()
        df = workbook.parse(sheet_name=sh, usecols="B:AO")
    return df.assign(Filename=f"[{os.path.basename(file)}]")
# Resolve the workbook list once so all three sheets use the same set of files
xlsx_files = glob.glob("*.xlsx")
# LIST COMPREHENSIONS TO RETRIEVE SPECIFIC SHEETS
sheet1_dfs = [read_xl_data(f, "sheet1") for f in xlsx_files]
sheet2_dfs = [read_xl_data(f, "sheet2") for f in xlsx_files]
sheet3_dfs = [read_xl_data(f, "sheet3") for f in xlsx_files]
# CONCAT CORRESPONDING SHEET DFS TOGETHER
# (names must match the lists built above; ignore_index gives a continuous row index)
all_workbook1 = pd.concat(sheet1_dfs, ignore_index=True)
all_workbook2 = pd.concat(sheet2_dfs, ignore_index=True)
all_workbook3 = pd.concat(sheet3_dfs, ignore_index=True)

Merging multiple dataframes into single sheet top to bottom using loop

My Python is rudimentary. What I want it to do is take the first dataframe, search for a unique number and create a new df in the same formatted template, then use the same unique number to search through the second df and create a new df pertinent to that unique number in the specified format, then merge all the looped data on top of each other.
This is the code
# Function
# NOTE(review): the indentation below is reconstructed; the original nesting of
# the two loops is a best guess — confirm.
def multiple_dfs(df_list, sheets, file_name, spaces):
    """Write the frames in ``df_list`` below each other on sheet ``sheets`` of ``file_name``.

    ``spaces`` is added to the row offset after each frame.
    """
    writer = pd.ExcelWriter(file_name,engine='xlsxwriter')
    row = 0
    for i in uniqueIR:
        # Slices of the module-level frames for the current invoice reference number
        dftopi = df_out[df_out['Invoice Reference Number'] == i]
        df2 = df_out_fin[df_out_fin['Invoice Reference Number'] == i]
        df3 = df2.drop(columns = ['Invoice Reference Number'])
        for dataframe in df_list:
            dataframe.to_excel(writer,sheet_name=sheets,startrow=row , startcol=0, index = False)
            # Advance past the rows just written plus the requested spacing
            row = row + len(dataframe.index) + spaces
    writer.save()
# list of dataframes
# NOTE(review): dftopi and df3 are only assigned inside multiple_dfs() in this
# snippet, so they look undefined at this point — confirm.
dfs = [dftopi,df3]
# run function
multiple_dfs(dfs, 'Validation', 'test1.xlsx', 1)
This is what I want:
table output
I figured out a solution, in case anyone in the future is wondering:
# Every per-invoice frame is also collected so it can be stacked on one sheet later
dflist = []
# The context manager saves and closes the workbook, even if a write fails
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('test3.xlsx', engine='xlsxwriter') as writer:
    for i in uniqueIR:
        # Top and bottom halves for the current invoice reference number
        dftopi = df_out[df_out['Invoice Reference Number'] == i]
        df2 = df_out_fin[df_out_fin['Invoice Reference Number'] == i]
        df3 = df2.drop(columns=['Invoice Reference Number'])
        dftopi.to_excel(writer, sheet_name='Top Half' + str(i), index=False)
        df3.to_excel(writer, sheet_name='Bottom Half' + str(i), index=False)
        dflist.extend([dftopi, df3])
def multiple_dfs(df_list, sheets, file_name, spaces):
    """Stack the frames in ``df_list`` top to bottom on a single sheet.

    Args:
        df_list: DataFrames to write, in order.
        sheets: name of the target sheet.
        file_name: workbook to create (xlsxwriter engine).
        spaces: number of extra rows added to the offset after each frame.
    """
    # The context manager saves and closes the workbook, even if a write fails
    # (ExcelWriter.save() was removed in pandas 2.0).
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        row = 0
        for dataframe in df_list:
            dataframe.to_excel(writer, sheet_name=sheets, startrow=row, startcol=0, index=False)
            # Advance past the rows just written plus the requested spacing
            row = row + len(dataframe.index) + spaces


multiple_dfs(dflist, 'Validation', 'test4.xlsx', 1)

Combining Excel worksheets over multiple loops

I've got a number of Excel workbooks, each with multiple worksheets, that I'd like to combine.
I've set up two sets of loops (one while, one for) to read in rows for each sheet in a given workbook and then do the same for all workbooks.
I tried to do it on a subset of these, and it appears to work until I try to combine the two sets using the pd.concat function. Error given is
TypeError: first argument must be an iterable of pandas objects, you
passed an object of type "DataFrame"
Any idea what I'm doing incorrectly?
import pandas as pd
d = 2013  # first workbook year
numberOfSheets = 5
# One workbook per year while d is below 2015
while d < 2015:
    #print(str(d) + ' beginning')
    f ='H:/MyDocuments/Z Project Work/scriptTest ' + str(d) + '.xlsx'
    # Sheets are named 'Table 1' .. 'Table <numberOfSheets>'
    for i in range(1,numberOfSheets+1):
        data = pd.read_excel(f, sheetname = 'Table '+str(i), header=None)
        print(i)
        # NOTE(review): df is not assigned anywhere before this line in the
        # snippet, and the result of append() is not kept — confirm.
        df.append(data)
    print(str(d) + ' complete')
    print(df)
    d += 1
# NOTE(review): pd.concat is given df itself rather than a list of frames,
# which matches the TypeError quoted in the question — confirm.
df = pd.concat(df)
print(df)
final = "H:/MyDocuments/Z Project Work/mergedfile.xlsx"
df.to_excel(final)
As the error says, pd.concat() requires an iterable, like a list: pd.concat([df1, df2]) will concatenate df1 and df2 along the default axis of 0, which means df2 is appended to the bottom of df1.
Two issues need fixing:
The for loop refers to df before assigning anything to it.
The variable df is overwritten with each iteration of the for loop.
One workaround is to create an empty list of DataFrames before the loops, then append DataFrames to that list, and finally concatenate all the DataFrames in that list. Something like this:
import pandas as pd
d = 2013  # first workbook year
numberOfSheets = 5
# Collect every sheet here and concatenate once after the loops
dfs = []
while d < 2015:
    f = 'H:/MyDocuments/Z Project Work/scriptTest ' + str(d) + '.xlsx'
    for i in range(1, numberOfSheets + 1):
        # sheet_name is the current keyword; the old "sheetname" spelling was removed
        data = pd.read_excel(f, sheet_name='Table ' + str(i), header=None)
        print(i)
        dfs.append(data)
    # No print(df) here: df is never defined in this version of the script
    print(str(d) + ' complete')
    d += 1
# ignore_index=True gives the result a default integer index starting from 0
df_final = pd.concat(dfs, ignore_index=True)
print(df_final)
final_path = "H:/MyDocuments/Z Project Work/mergedfile.xlsx"
df_final.to_excel(final_path)
Since I can't comment, I'll leave this as an answer: you can speed up this code by opening the file once then parsing the workbook to get each sheet. Should save a second or two off each iteration, since opening the Excel file takes the longest. Here's some code that might help.
Note: setting sheet_name=None will return ALL the sheets in the workbook:
dfs = {<sheetname1>: <DataFrame1>, <sheetname2>: <DataFrame2>, etc.}
Here's the code:
# Open the workbook once; the context manager closes the file handle afterwards
with pd.ExcelFile(fpath) as xl:
    # sheet_name=None returns a dict of {sheet name: DataFrame}
    dfs = xl.parse(sheet_name=None, header=None)
# Iterate over the DataFrames (iterating the dict itself would only yield the names)
for i, df in enumerate(dfs.values()):
    # <do stuff with each, if you want>
    print('Sheet {0} looks like:\n{1}'.format(i+1, df))
Thank you, both. I accepted the answer that addressed the specific question, but was able to use the second answer and some additional googling thereafter (eg, glob) to amend the original code, and automate more fully independent of number of workbooks or worksheets.
Final version of the above now below:
import pandas as pd
import glob
#import numpy as np
#import os, collections, csv
#from os.path import basename
from pathlib import Path

fpath = "H:/MyDocuments/Z Project Work/"
final = "H:/MyDocuments/Z Project Work/mergedfile.xlsx"
dfs = []
files = glob.glob(fpath+'*.xlsx')
for f in files:
    # The merged output lives in the same folder; skip it so a second run does
    # not fold the previous result back into the data.
    if Path(f) == Path(final):
        continue
    # sheet_name=None returns {sheet name: DataFrame}; read_excel closes the file itself
    xls = pd.read_excel(f, sheet_name=None, header=0)
    for i, sheet_df in enumerate(xls.values()):
        print(i)
        dfs.append(sheet_df)
    print(f+ ' complete')
df_final = pd.concat(dfs, ignore_index=True)
df_final.to_excel(final)

Categories