Merging multiple dataframes into single sheet top to bottom using loop - python

My Python is rudimentary. What I want the code to do is take the first dataframe, search for a unique number, and create a new df in the same formatted template; then use the same unique number to search through the second df and create a new df pertinent to that unique number in the specified format; and finally merge all the looped data on top of each other.
This is the code
#function
def multiple_dfs(df_list, sheets, file_name, spaces):
writer = pd.ExcelWriter(file_name,engine='xlsxwriter')
row = 0
for i in uniqueIR:
dftopi = df_out[df_out['Invoice Reference Number'] == i]
df2 = df_out_fin[df_out_fin['Invoice Reference Number'] == i]
df3 = df2.drop(columns = ['Invoice Reference Number'])
for dataframe in df_list:
dataframe.to_excel(writer,sheet_name=sheets,startrow=row , startcol=0, index = False)
row = row + len(dataframe.index) + spaces
writer.save()
# list of dataframes
dfs = [dftopi,df3]
# run function
multiple_dfs(dfs, 'Validation', 'test1.xlsx', 1)
This is what I want:
table output

Figured out a solution, in case anyone in the future is wondering:
# Write one "Top Half"/"Bottom Half" sheet pair per invoice reference number
# and collect the same frames, in order, so they can be stacked on a single
# sheet afterwards.
dflist = []
# The with-block saves and closes the workbook on exit, even if a write fails;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('test3.xlsx', engine='xlsxwriter') as writer:
    for i in uniqueIR:
        # rows of each source frame that belong to this invoice reference
        dftopi = df_out[df_out['Invoice Reference Number'] == i]
        df2 = df_out_fin[df_out_fin['Invoice Reference Number'] == i]
        # the reference number is already shown in the top half
        df3 = df2.drop(columns=['Invoice Reference Number'])
        dftopi.to_excel(writer, sheet_name='Top Half' + str(i), index=False)
        df3.to_excel(writer, sheet_name='Bottom Half' + str(i), index=False)
        dflist.extend([dftopi, df3])
def multiple_dfs(df_list, sheets, file_name, spaces):
    """Stack several dataframes top to bottom on one worksheet.

    Args:
        df_list: Dataframes to write, in the order they should appear.
        sheets: Name of the single worksheet that receives every dataframe.
        file_name: Path of the Excel workbook to create (xlsxwriter engine).
        spaces: Number of rows added to a dataframe's row count to find the
            start row of the next dataframe. to_excel also writes a header
            row by default, so ``spaces=1`` places the next dataframe
            directly below the previous one and ``spaces=2`` leaves one
            blank row.
    """
    row = 0
    # The with-block saves and closes the workbook on exit, even if a write
    # fails; ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        for dataframe in df_list:
            dataframe.to_excel(writer, sheet_name=sheets, startrow=row,
                               startcol=0, index=False)
            row += len(dataframe.index) + spaces
multiple_dfs(dflist, 'Validation', 'test4.xlsx', 1)

Related

Storing data from a for loop in excel python

Want to store all the data from the for loop in an excel file, currently only storing the last output:
import pandas
import openpyxl
outputFile = 'outputData.xlsx'
workbook = openpyxl.load_workbook(os.getcwd() + '/sourceData.xlsx')
sheet = workbook["Sheet1"]
for i in range(2, sheet.max_row + 1):
<I do some ops to copy the data>
data = pyperclip.paste() #Want this data to be stored in the outPut excel, there is different input for every cell, so there will also be different output
df = pd.DataFrame({'Address':[mapData]})
df2 = pd.DataFrame()
df2 = df2.append(df, ignore_index=True, sort=False)
writer = ExcelWriter(outputFile)
df2.to_excel(writer,'Sheet1',index=False)
writer.save()
Just move the first initialization of df2 out of the loop:
...
# define df2 here, just once
df2 = pd.DataFrame()
for i in range(2, sheet.max_row + 1):
    ...  # your operations
    df = pd.DataFrame({'Address': [mapData]})
    # add df to df2 immediately after generation of df; DataFrame.append was
    # removed in pandas 2.0 and pd.concat is the supported replacement
    df2 = pd.concat([df2, df], ignore_index=True, sort=False)
# save as before; the with-block saves and closes the file on exit
# (ExcelWriter.save() was removed in pandas 2.0)
with ExcelWriter(outputFile) as writer:
    df2.to_excel(writer, sheet_name='Sheet1', index=False)

Comparison of 2 different excel files using python

I have 2 different Excel files with different numbers of rows and columns. I have to compare the amounts in both Excel sheets based on the unique ids, and if there is any change in a value I have to fetch those rows and write them to a new Excel file. Also, if there is any new entry in the 2nd Excel file, that data needs to be copied into the new Excel file as well. The number of rows is different in the two files. I tried the following approach, but it is not working: it raises TypeError: 'Book' object is not subscriptable for the `and` condition in the if statement, and if I only iterate over the rows without checking for the same indexes, rows are reported as missing in the results.
from itertools import zip_longest
import xlrd
rb1 = xlrd.open_workbook('./first_file1.xlsx')
rb2 = xlrd.open_workbook('./other_file1.xlsx')
sheet1 = rb1.sheet_by_index(0)
sheet2 = rb2.sheet_by_index(0)
for rownum in range(max(sheet1.nrows, sheet2.nrows)):
if (rownum < sheet2.nrows) and (rb1[0] == rb2[0]):
row_rb1 = sheet1.row_values(rownum)
row_rb2 = sheet2.row_values(rownum)
for colnum, (c1, c2) in enumerate(zip_longest(row_rb1, row_rb2)):
if c1 != c2:
print ("Cell {}{} {} != {}".format(rownum+1, xlrd.formula.colname(colnum), c1, c2))
else:
print ("Row {} missing".format(rownum+1))
You can try this:
# NOTE: xlrd 2.0 and later only read .xls files; opening these .xlsx workbooks
# needs xlrd < 2.0 (or a different reader such as openpyxl).
rb1 = xlrd.open_workbook('./first_file1.xlsx')
rb2 = xlrd.open_workbook('./other_file1.xlsx')
sheet1 = rb1.sheet_by_index(0)
sheet2 = rb2.sheet_by_index(0)

# Index the rows of sheet1 by their id (first cell) so that every row of
# sheet2 is matched by id, not by its position in the sheet.
# Assumes ids are unique within sheet1; if an id repeats, the first row wins.
rows_by_id = {}
for rownum_sheet1 in range(sheet1.nrows):
    row_rb1 = sheet1.row_values(rownum_sheet1)
    rows_by_id.setdefault(row_rb1[0], row_rb1)

new_df = []
for rownum_sheet2 in range(sheet2.nrows):  # go through the (possibly longer) sheet2
    row_rb2 = sheet2.row_values(rownum_sheet2)
    row_rb1 = rows_by_id.get(row_rb2[0])
    # keep the row when its id is new (absent from sheet1) or when the row
    # with the same id differs; each sheet2 row is appended at most once
    if row_rb1 is None or row_rb1 != row_rb2:
        new_df.append(row_rb2)
#write new df to new excel-file
New Code:
df1_1 = pd.read_table('.../first_file.txt', sep = '/t')
df1_1.to_excel('filename1.xlsx')
df_first_file = pd.concat([df1_1['Column'].str.split(' ',expand=True)],axis=1)
df_new1 = df_first_file.to_excel('first_file1.xlsx')
df1 = pd.read_table('.../otherfile.txt', sep = '/t')
df1.to_excel('filename2.xlsx')
df_otherfile = pd.concat([df1['Column'].str.split(' ',expand=True)],axis=1)
df_new2 = df_otherfile.to_excel('other_file1.xlsx')
new_df = []
for i, rownum_sheet2 in enumerate(range(df_new2.nrows)): #go through the (possible longer) sheet2
row_rb2 = df_new2.row_values(rownum_sheet2)
for rownum_sheet1 in range(df_new1.nrows): #go through sheet1 and check for same id
row_rb1 = df_new1.row_values(rownum_sheet1)
if row_rb1[0] == row_rb2[0]: #if the row with the same id is not equal: append to new df
if row_rb1 != row_rb2:
new_df.append(row_rb2)
if i >= df_new1.nrows: #if there are extra rows, append to new df
new_df2 = new_df.append(row_rb2)
print (new_df2)
new_df.to_excel('final_filename.xlsx')

Python Pandas ExcelWriter append to sheet creates a new sheet

I would really appreciate some help.
I'm trying to use a loop to create sheets and add data to those sheets on every iteration. The position of my data is correct; however, pandas ExcelWriter creates a new sheet instead of appending to the one created the first time the loop runs.
I'm a beginner, and right now function comes before form, so forgive me.
My code:
import pandas as pd
# initial files for dataframes
excel_file = 'output.xlsx'
setup_file = 'setup.xlsx'
# write to excel
output_filename = 'output_final.xlsx'
df = pd.read_excel(excel_file) # create dataframe of entire sheet
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')',
'') # clean dataframe titles
df_setup = pd.read_excel(setup_file)
df_setup.columns = df_setup.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')',
'') # clean dataframe titles
df_2 = pd.merge(df, df_setup) # Merge data with setup to have krymp size for each wire in dataframe
df_2['wirelabel'] = "'" + df_2['cable'] + "_" + df_2['function_code'] + "-" + df_2['terminal_strip'] + ":" + df_2[
'terminal'] # creates column for the wirelabel by appending columns with set delimiters. #TODO: delimiters to be by inputs.
df_2.sort_values(by=['switchboard']) # sort so we get proper order
switchboard_unique = df.switchboard.unique().tolist() # crate variable containing unique switchboards for printing to excel sheets
def createsheets(output_filename, sheetname, row_start, column_start, df_towrite):
with pd.ExcelWriter(output_filename, engine='openpyxl', mode='a') as writer:
df_towrite.to_excel(writer, sheet_name=sheetname, columns=['wirelabel'], startrow=row_start, startcol=column_start, index=False, header=False)
writer.save()
writer.close()
def sorter():
for s in switchboard_unique:
df_3 = df_2.loc[df_2['switchboard'] == s]
krymp_unique = df_3.krymp.unique().tolist()
krymp_unique.sort()
# print(krymp_unique)
column_start = 0
row_start = 0
for k in krymp_unique:
df_3.loc[df_3['krymp'] == k]
# print(k)
# print(s)
# print(df_3['wirelabel'])
createsheets(output_filename, s, row_start, column_start, df_3)
column_start = column_start + 1
sorter()
current behavior:
If sheetname = sheet, then my script creates sheet1, sheet2, sheet3, etc.
pictureofcurrent
Wanted behavior
Create a sheet for each item in "df_3", and put data into columns according to the position calculated in column_start. The position in my code works, just goes to the wrong sheet.
pictureofwanted
I hope it's clear what I'm trying to accomplish, and all help is appreciated.
I tried all the example code I have found regarding writing to Excel.
I know my code is not a work of art, but I will update this post with the answer to my own question for the sake of completeness, and if anyone stumbles on this post.
It turns out I misunderstood the capabilities of the "append" mode in pandas "pd.ExcelWriter". It is not possible to append to a sheet that already exists; the sheet will get overwritten even though mode is set to 'a'.
Realizing this, I changed my code to build a dataframe for the entire sheet (df_sheet) and then call the "createsheets" function in my code. The first version wrote my data column by column.
"Final" code:
import pandas as pd
# initial files for dataframes
excel_file = 'output.xlsx'
setup_file = 'setup.xlsx'
# write to excel
output_filename = 'output_final.xlsx'
column_name = 0
def _clean_columns(columns):
    """Normalize header labels: trim, lower-case, spaces to '_', drop parentheses."""
    # regex=False: '(' and ')' are literal characters here, not regex syntax,
    # and the default of `regex` differs between pandas versions
    return (columns.str.strip().str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace('(', '', regex=False)
            .str.replace(')', '', regex=False))


df = pd.read_excel(excel_file)  # create dataframe of entire sheet
df.columns = _clean_columns(df.columns)  # clean dataframe titles
df_setup = pd.read_excel(setup_file)
df_setup.columns = _clean_columns(df_setup.columns)  # clean dataframe titles
df_2 = pd.merge(df, df_setup)  # Merge data with setup to have krymp size for each wire in dataframe
# creates column for the wirelabel by appending columns with set delimiters. #TODO: delimiters to be by inputs.
df_2['wirelabel'] = ("'" + df_2['cable'] + "_" + df_2['function_code'] + "-"
                     + df_2['terminal_strip'] + ":" + df_2['terminal'])
# sort so we get proper order; sort_values returns a new frame, so the result
# must be assigned back. mergesort is stable, which keeps the original row
# order within each switchboard.
df_2 = df_2.sort_values(by=['switchboard'], kind='mergesort')
switchboard_unique = df.switchboard.unique().tolist()  # create variable containing unique switchboards for printing to excel sheets
def createsheets(output_filename, sheetname, df_towrite):
    """Write ``df_towrite`` to the sheet ``sheetname`` of an Excel workbook.

    Args:
        output_filename: Path of the workbook. It is created when it does not
            exist yet; otherwise the sheet is added to the existing workbook.
        sheetname: Name of the worksheet to write.
        df_towrite: Dataframe to write, with its header row and without its
            index.
    """
    import os

    if os.path.exists(output_filename):
        # Append mode keeps the workbook's other sheets. 'replace' swaps out a
        # sheet of the same name (e.g. from an earlier run) instead of failing.
        writer_options = {'mode': 'a', 'if_sheet_exists': 'replace'}
    else:
        # mode='a' cannot open a workbook that does not exist yet
        writer_options = {'mode': 'w'}
    with pd.ExcelWriter(output_filename, engine='openpyxl', **writer_options) as writer:
        df_towrite.to_excel(writer, sheet_name=sheetname, index=False, header=True)
def to_csv_file(output_filename, df_towrite):
    """Save ``df_towrite`` as a CSV file at ``output_filename``.

    The file is opened in write mode and the dataframe's index is left out.
    """
    csv_options = {'mode': 'w', 'index': False}
    df_towrite.to_csv(output_filename, **csv_options)
def sorter():
    """Export one worksheet and one CSV file per switchboard.

    For every entry of ``switchboard_unique`` the matching rows of ``df_2``
    are split by their ``krymp`` value. Each value becomes one column of wire
    labels, and the columns are placed side by side in one dataframe that is
    passed to ``createsheets`` and ``to_csv_file``.
    """
    for s in switchboard_unique:
        df_3 = df_2.loc[df_2['switchboard'] == s]
        label_columns = []
        for k in sorted(df_3.krymp.unique().tolist()):
            column_name = "krymp " + str(k) + " Tavle: " + str(s)
            labels = df_3.loc[df_3['krymp'] == k, 'wirelabel']
            # restart the index at 0 so every column starts in the first row
            label_columns.append(labels.reset_index(drop=True).rename(column_name))
        # one concat per sheet instead of re-copying the growing frame for
        # every krymp value; pd.concat rejects an empty list, hence the guard
        df_sheet = pd.concat(label_columns, axis=1) if label_columns else pd.DataFrame([])
        # str() keeps the sheet and file names valid for non-string switchboard ids
        createsheets(output_filename, str(s), df_sheet)
        to_csv_file(str(s) + ".csv", df_sheet)
sorter()
Thank you.

Python, iterating through an excel spreadsheet

I need to write a program that loops through the columns of data, resetting the variables based on the cell values, with each column representing a variable.
The variables in the exercise are dependent on these values that are being looped through.
How can I loop through the rows with each iteration of the loop increasing the value by 1?
df=pd.DataFrame(r'C:/Users/user.name/Desktop/P_list.xlsx',sheet_name = 'Sheet1')
for i in range(0,5000):
df2 = pd.read_excel(r'C:/Users/user.name/Desktop/P_list.xlsx',sheet_name = 'Sheet1'), index = list(range(i,5000,1), columns=list(range(0)))
df3 = pd.read_excel(r'C:/Users/user.name/Desktop/P_list.xlsx',sheet_name = 'Sheet1'), index = list(range(i,5000,1), columns=list(range(1)))
df4 = pd.read_excel(r'C:/Users/user.name/Desktop/P_list.xlsx',sheet_name = 'Sheet1'), index = list(range(i,5000,1), columns=list(range(2)))
df5 = pd.read_excel(r'C:/Users/user.name/Desktop/P_list.xlsx',sheet_name = 'Sheet1'), index = list(range(i,5000,1), columns=list(range(3)))
firstname = df2
lastname = df3
address = df4
number= df5
#performed exercise
I tried this in Jupyter. This is what is needed to load the Excel file into a df:
import numpy as np
import pandas as pd
import xlrd
df = pd.read_excel('Sample.xlsx', sheet_name = 0)
Then looping over the column names looks like this:
for col in df.columns:
print(col)
And looping over the data looks like this:
for col in df.columns:
print("NEW ROW ----------")
for val in df[col]:
print (val)
This is the printed data:
Another way to do it is to loop through the columns and the rows:
# Walk the frame by position: every column, and within it every row.
columns = len(df.columns)
rows = len(df)
for column in range(columns):
    print("-----------")
    for row in range(rows):
        # iloc addresses the cell purely by position. df.loc[row][column]
        # needs the index labels to be 0..n-1 and relies on the deprecated
        # positional fallback for integer keys on a label-indexed Series.
        print(df.iloc[row, column])

For loop with enumerate iterates 4 times, but only 1 dataframe is written to Excel instead of 4 dataframes

I have been struggling with this code all day. During each run of the loop, a table is read from a different MS Word file. The table is copied to a dataframe and then it is copied to a row in an Excel file.
With each subsequent run of the for-loop, the Excel row is incremented so the new dataframe can be written to a new row, but after the file executes only one row is showing a dataframe.
When I print(tfile), I get the following .. ('CIV-ASCS-016_TRS.docx', 'CIV-ASCS-018_TRS .docx', 'CIV-ASCS-020_TRS.docx', 'CIV-ASCS-021_TRS .docx') This proves that loop ran 4 times based on 4 files in the directory. I set the initial row pos to 0 outside of the for-loop.
Note: I am not showing any lines of code with regards to importing the necessary libraries.
files = glob('*.docx')
pos = 1
for i, wfile in enumerate(files[:1]):
document = Document(wfile)
table = document.tables[0]
data = []
keys = {}
for j, row in enumerate(table.rows):
text = (cell.text for cell in row.cells)
if j == 0:
keys = tuple(text)
continue
row_data = dict(zip(keys, text))
data.append(row_data)
tfile = tuple(files)
df = pd.DataFrame(data)
df.loc[-1] = [wfile, 'Test Case ID']
df.index = df.index + 1 # shifting index
df = df.sort_index() # sorting by index
df1 = df.rename(index=str, columns={"Test Case ID": "TC Attributes"})
df21 = df1.drop(columns = ['TC Attributes'])
df3 = df21.T
# read the existing sheets so that openpyxl won't create a new one later
book = load_workbook('test.xlsx')
writer = pd.ExcelWriter('test.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
df3.to_excel(writer, 'sheet7', header = False, index = False, \
startrow = pos)
pos += 1
writer.save()

Categories