Df export all into one larger file - python

I have a df reading in multiple .xlsx files. I have manipulated what I need in the files and the export view is exact. However, I need the data to export into one larger 2 column file rather than multiple individual files.
Any help is appreciated. I haven't been able to figure the problem out on my own.
import os
import glob
import pandas as pd
# Ask for the Desktop sub-folder that holds the workbooks and make it the working directory.
folder = input('Enter the folder name: ')
os.chdir('C:/Users/PCTR261010/Desktop/' + folder)
# Every .xlsx file in that folder.
FileList = glob.glob('*.xlsx')
for fname in FileList:
# 'New' is 'mpcc_' plus the text after the first '#' and before the first '-' of the file name.
df = pd.read_excel(fname).assign(New=os.path.basename('mpcc_' + (fname.split('-', 1)[0]).split('#', 1)[1]))
# Keep only the two columns that are exported.
df1 = df[['New', '<ID>']]
# NOTE(review): indentation was lost in this listing. df1 only ever holds the
# columns of one workbook, so whatever is written here is a single file's data - confirm.
writer = pd.ExcelWriter('ParttoMPCC_Import.xlsx', engine='xlsxwriter')
# Written without index and without a header row.
df1.to_excel(writer, sheet_name='Import', index=False, header=False)
writer.save()

You can append desired columns in a single DataFrame and write that DataFrame to an excel file. Below code should do the job.
import os
import glob
import pandas as pd
OUTPUT_NAME = 'ParttoMPCC_Import.xlsx'

folder = input('Enter the folder name: ')
os.chdir('C:/Users/PCTR261010/Desktop/' + folder)
# Skip the export file itself so a second run does not try to re-import it.
FileList = [name for name in glob.glob('*.xlsx') if name != OUTPUT_NAME]

# Collect the two export columns of every workbook and combine them once;
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
frames = []
for fname in FileList:
    # 'New' is 'mpcc_' plus the text after the first '#' and before the first '-' of the file name.
    df = pd.read_excel(fname).assign(New=os.path.basename('mpcc_' + (fname.split('-', 1)[0]).split('#', 1)[1]))
    frames.append(df[['New', '<ID>']])

# pd.concat raises on an empty list, so fall back to an empty frame.
df1 = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# The context manager saves and closes the workbook (ExcelWriter.save was removed in pandas 2.0).
with pd.ExcelWriter(OUTPUT_NAME, engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name='Import', index=False, header=False)

You can use pd.concat as follows:
# One two-column frame per workbook, combined with a single pd.concat call.
data = []
for fname in FileList:
    # 'New' is 'mpcc_' plus the text after the first '#' and before the first '-' of the file name.
    df = pd.read_excel(fname).assign(New=os.path.basename('mpcc_' + (fname.split('-', 1)[0]).split('#', 1)[1]))
    df1 = df[['New', '<ID>']]
    data.append(df1)

# Build the combined frame before opening the output file, so a failure here
# does not leave an empty workbook behind.
df = pd.concat(data)

# The context manager saves and closes the workbook (ExcelWriter.save was removed in pandas 2.0).
with pd.ExcelWriter('ParttoMPCC_Import.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Import', index=False, header=False)

Related

Keep enter space in text column when converting to csv

I have data in Excel where the last column contains text with line breaks (entered with Enter). Here are examples of my data:
If I convert using python to csv, my data looks like this:
I need the TEXT column to look like this:
This is my script:
import pandas as pd
import os
import numpy as np
WD = r'XXX'
os.chdir(WD)
for file in os.listdir(WD):
    if not file.endswith('.xlsx'):
        continue
    FILE = file
    # Open the workbook once, reuse it for every sheet, and close it afterwards.
    with pd.ExcelFile(FILE) as workbook:
        for sn in workbook.sheet_names:
            # One CSV per sheet: '<sheet>_<workbook>.csv'.
            OUTPUT_FILE = '{}_{}'.format(sn, FILE.replace('.xlsx', '.csv'))
            # Read the sheet being exported; without sheet_name every CSV
            # would contain the workbook's default sheet.
            df = pd.read_excel(workbook, sheet_name=sn)
            print(FILE, sn)
            for col in df.columns.to_list():
                # Values found in the mapping become ''; everything else maps to
                # NaN and is restored from the original column by fillna.
                df[col] = df[col].map({True: '', False: ''}).fillna(df[col])
            # Keep exactly these columns, in this order.
            cn = ['IN', 'NAME', 'TEXT']
            df = df.reindex(columns=cn)
            df.to_csv(OUTPUT_FILE, sep='|', encoding='utf-8-sig', index=False)
Do you have any idea?
I hope this works for your solution, (pip install xlsxwriter) before executing
Excel to csv:
import pandas as pd
# Load the source workbook into a DataFrame.
df = pd.read_excel('./keep_enter.xlsx')
def replace_custom_func(x):
    """Turn multi-line text into an Excel formula that rebuilds its line breaks.

    Every line of the text becomes a quoted string literal and the literals
    are joined with ``&CHAR(10)&``; two lines ``a`` and ``b`` therefore give
    ``="a"&CHAR(10)&"b"``.

    Args:
        x: Cell value, normally a string.

    Returns:
        The formula string for a non-empty string. Empty strings and
        non-string values (for example NaN from an empty cell) are returned
        unchanged.
    """
    # len()/split() fail on non-string cell values such as NaN, so pass those through.
    if not isinstance(x, str) or not x:
        return x
    # A double quote inside an Excel string literal is escaped by doubling it.
    literals = ['"{}"'.format(line.replace('"', '""')) for line in x.split('\n')]
    return '=' + '&CHAR(10)&'.join(literals)
# Convert every 'Text' cell, then export pipe-delimited without the index.
text_as_formula = df['Text'].apply(replace_custom_func)
df['Text'] = text_as_formula
df.to_csv('keep_enter1.csv', index=False, sep='|')
CSV to Excel:
df = pd.read_csv('./keep_enter1.csv', sep='|')
# The context manager saves and closes the workbook (ExcelWriter.save was removed in pandas 2.0).
with pd.ExcelWriter('new_excel_replace12345.xlsx', engine='xlsxwriter') as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # Get the xlsxwriter workbook and worksheet objects.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    # Named wrap_format so the built-in format() is not shadowed.
    wrap_format = workbook.add_format({'text_wrap': True})
    worksheet.set_column('C:D', None, wrap_format)
    # Write the formula of every row, not only the first one. Sheet row 0 is
    # the header, so data row i lives in sheet row i + 1.
    text_col = df.columns.get_loc('Text')
    for sheet_row, value in enumerate(df['Text'], start=1):
        if isinstance(value, str) and value.startswith('='):
            worksheet.write_formula(sheet_row, text_col, value, wrap_format)
Output:

Pandas - Move output columns into a new sheet

I have many dataframes as txt files that I'm converting into xlsx. For each file, I want to take my output columns and move them into a new sheet called "Analyzed Data". I'm not sure how to do this with ExcelWriter:
# The context manager saves and closes the workbook on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter('filepath', engine = 'xlsxwriter') as writer:
    df.to_excel(writer, sheet_name = ' Data Analyzed')
My understanding is that this requires my file to be xlsx, I have to write the filepath separately for each xlsx file, and I'm not sure how to select only my output columns as the ones to move to the new sheet. Each file has a different amount of columns with different column names. My code is below:
import os
import pandas as pd
path = r'C:\Users\Me\1Test'

# Every .txt file below `path`, including sub-folders.
filelist = [
    os.path.join(root, f)
    for root, dirs, files in os.walk(path)
    for f in files
    if f.endswith('.txt')
]

for f in filelist:
    df = pd.read_table(f)
    # Row-wise mean of every column except the last three.
    col = df.iloc[:, :-3]
    df['Average'] = col.mean(axis=1)
    # Append copies of the columns (all but those at positions -3 and -1) with
    # the column at position -3 subtracted row by row, suffixed ' - Background'.
    out = (df.join(df.drop(df.columns[[-3, -1]], axis=1)
                   .sub(df[df.columns[-3]], axis=0)
                   .add_suffix(' - Background')))
    # Swap only the extension: str.replace('txt', 'xlsx') would also rewrite
    # a 'txt' that occurs in a folder or file name.
    out.to_excel(os.path.splitext(f)[0] + '.xlsx', sheet_name='Sheet1')

Read from multiple Excel files and write to one file

I am trying to read data from multiple xls files and write it to one single file.
My code below is writing only the first file. Not sure what I am missing.
import glob
import os

import pandas as pd


def list_files(dir):
    """Return the paths of all files below *dir*, descending into sub-folders.

    Args:
        dir: Folder to walk.

    Returns:
        A list of paths, each built by joining the walked folder and the
        file name. Empty when nothing is found.
    """
    # `dir` shadows the built-in but is kept: it is the caller-visible parameter name.
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(dir)
        for name in files
    ]
# Gather every file below the BOFS folder.
files = list_files("C:\\Users\\12345\\BOFS")
for file in files:
df = pd.read_excel(file)
# Use the row at position 1 as the column names and keep the rows from position 2 onward.
new_header = df.iloc[1]
df = df[2:]
df.columns = new_header
# NOTE(review): the target is opened with mode='a' and no sheet_name is passed
# to to_excel, so every write appears to address the same default sheet - confirm.
with pd.ExcelWriter("C:\\Users\\12345\\Test\\Test.xls", mode='a') as writer:
df.to_excel(writer,index=False, header=True,)
Documentation says:
ExcelWriter can also be used to append to an existing Excel file:
# Append mode needs the openpyxl engine; name it explicitly so the call does
# not depend on which Excel engine pandas picks by default.
with pd.ExcelWriter('output.xlsx',
                    mode='a', engine='openpyxl') as writer:
    df.to_excel(writer, sheet_name='Sheet_name_3')
And that probably replaces given sheet
But you could use pd.concat(<dataframes>) to concatenate dataframes and write all data at once in a single sheet.
I tested this piece of code; hopefully it works in your case.
import glob
import os

import pandas as pd

OUTPUT_NAME = 'output.xlsx'

os.chdir("D:/Data Science/stackoverflow")
# Read every workbook in the folder, skipping the output of a previous run so
# it is not merged into itself.
frames = [pd.read_excel(file) for file in glob.glob("*.xlsx") if file != OUTPUT_NAME]
# Combine once (DataFrame.append was removed in pandas 2.0); pd.concat raises
# on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# now save the data frame; the context manager saves and closes the workbook
with pd.ExcelWriter(OUTPUT_NAME) as writer:
    all_data.to_excel(writer, sheet_name='sheet1')

Join multiple excel files into one excel file via python

I need to combine all the excel files in my directory into one excel file.
for example, I've 3 excel files:
file1:
file 2:
file 3:
I need to concatenate them and get an output as follows
but instead, they were appended one after another
this is my code :
import pandas as pd
import numpy as np
import glob
# Read every workbook in the results folder.
frames = [
    pd.read_excel(f)
    for f in glob.glob('C:/Users/test-results/FinalResult/05-01-2019/*.xlsx')
]
# Stack the workbooks one below another (row-wise) with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and
# raises on an empty list, hence the fallback.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# The context manager saves and closes the workbook (ExcelWriter.save was removed in pandas 2.0).
with pd.ExcelWriter('mycollected_data.xlsx', engine='xlsxwriter') as writer:
    all_data.to_excel(writer, sheet_name='Sheet1')
During my search, all I found was how to append DataFrames, as shown in my code, and I couldn't figure out how to use join.
You could use
def _read_indexed(workbook_path):
    # Load one workbook, using its first column as the row index.
    return pd.read_excel(workbook_path, index_col=0)


files = glob.glob('C:/Users/test-results/FinalResult/05-01-2019/*.xlsx')
# Lazy iterator: each workbook is read only when pd.concat consumes it.
dfs = map(_read_indexed, files)
# axis=1 places the workbooks side by side, aligned on the row index.
all_data = pd.concat(dfs, axis=1)
Try this:
# NOTE(review): these two lines look like alternative suggestions rather than
# consecutive steps - confirm before running both.
# Place df's columns beside the columns already in all_data.
all_data = pd.concat([all_data,df],axis=1)
# Left-join df onto all_data, matching rows on the 'first_column_name' column.
all_data = all_data.merge(df, on = ['first_column_name'], how = 'left')

Excel file overwritten instead of concat - Python - Pandas

I'm trying to concatenate all Excel files, and the worksheets in them, into one using the script below. It kind of works, but the Excel file c.xlsx is overwritten for each input file, so only the last Excel file ends up concatenated. I'm not sure why.
import pandas as pd
import os
import ntpath
import glob
# Work from the directory that contains this script.
dir_path = os.path.dirname(os.path.realpath(__file__))
os.chdir(dir_path)
cdf = None
for excel_names in glob.glob('*.xlsx'):
print(excel_names)
# sheet_name=None requests every sheet, so df is a mapping of sheets rather than a single frame.
# NOTE(review): ignore_index is passed to read_excel here - confirm that the installed pandas accepts it.
df = pd.read_excel(excel_names, sheet_name=None, ignore_index=True)
# Concatenate the sheets of this one workbook; cdf is rebound, not extended.
cdf = pd.concat(df.values())
# NOTE(review): indentation was lost in this listing. cdf only holds one
# workbook's sheets at this point, so c.xlsx receives a single workbook's data - confirm.
cdf.to_excel("c.xlsx", header=False, index=False)
The idea is to build a list of DataFrames in a list comprehension. Because each workbook is read as an ordered dict of sheets, you need to concat the sheets of each workbook first and then concat again to get one big final DataFrame:
# One entry per workbook: the collection of its sheets (sheet_name=None reads
# them all). ignore_index is a pd.concat option, not a read_excel parameter,
# so it is passed only to concat below.
cdf = [pd.read_excel(excel_names, sheet_name=None).values()
       for excel_names in glob.glob('files/*.xlsx')]
# Inner concat joins the sheets of a workbook, outer concat joins the
# workbooks and renumbers the rows.
df = pd.concat([pd.concat(x) for x in cdf], ignore_index=True)
df.to_excel("c.xlsx", index=False)
I just tested the code below. It merges data from all Excel files in a folder into one, single, Excel file.
import pandas as pd
import numpy as np
import glob
import os

# Read every workbook in the folder. final.xlsx is written into the same
# folder, so skip it to keep a second run from merging its own output.
frames = [
    pd.read_excel(f)
    for f in glob.glob("C:\\your_path\\*.xlsx")
    if os.path.basename(f) != "final.xlsx"
]
# Combine once (DataFrame.append was removed in pandas 2.0); pd.concat raises
# on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(all_data)
df = all_data
df.to_excel("C:\\your_path\\final.xlsx", sheet_name='Sheet1')
I got it working using the below script which uses #ryguy72's answer but works on all worksheets as well as the header row.
import pandas as pd
import numpy as np
import glob
import os

# Gather every sheet of every workbook. final.xlsx is written into the same
# folder, so skip it to keep a second run from merging its own output.
frames = []
for f in glob.glob("my_path/*.xlsx"):
    if os.path.basename(f) == "final.xlsx":
        continue
    # sheet_name=None reads all sheets of the workbook. ignore_index is a
    # pd.concat option, not a read_excel parameter, so it is not passed here.
    sheets = pd.read_excel(f, sheet_name=None)
    frames.extend(sheets.values())
# Combine once (DataFrame.append was removed in pandas 2.0); pd.concat raises
# on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(all_data)
df = all_data
df.to_excel("my_path/final.xlsx", sheet_name='Sheet1')

Categories