Read from multiple excel and write to one file - python

I am trying to read data from multiple xls files and write it to one single file.
My code below is writing only the first file. Not sure what I am missing.
import glob
import os
import pandas as pd


def list_files(dir):
    """Return the joined path of every file that os.walk finds under ``dir``."""
    r = []
    for root, dirs, files in os.walk(dir):
        for name in files:
            r.append(os.path.join(root, name))
    return r


files = list_files("C:\\Users\\12345\\BOFS")
# NOTE(review): indentation was lost in the paste; the writer block is
# assumed to sit inside the loop — confirm against the original post.
for file in files:
    df = pd.read_excel(file)
    # The row at position 1 becomes the column header; rows from position 2
    # onward are kept as data.
    new_header = df.iloc[1]
    df = df[2:]
    df.columns = new_header
    # The writer is re-opened with mode='a' once per input file, and no
    # sheet name is passed to to_excel.
    with pd.ExcelWriter("C:\\Users\\12345\\Test\\Test.xls", mode='a') as writer:
        df.to_excel(writer,index=False, header=True,)

Documentation says:
ExcelWriter can also be used to append to an existing Excel file:
# Open the workbook with mode='a' and write df to the named sheet.
with pd.ExcelWriter('output.xlsx',
                    mode='a') as writer:
    df.to_excel(writer, sheet_name='Sheet_name_3')
And that probably replaces given sheet
But you could use pd.concat(<dataframes>) to concatenate dataframes and write all data at once in a single sheet.

I tested this piece of code; hopefully it works in your case.
import glob
import os
import pandas as pd

os.chdir("D:/Data Science/stackoverflow")

# Read every workbook in the folder. The output file is skipped so that a
# re-run does not fold the previous result back into the data.
frames = [
    pd.read_excel(file)
    for file in sorted(glob.glob("*.xlsx"))
    if file != "output.xlsx"
]

# Concatenate once at the end (DataFrame.append was removed in pandas 2.0).
# pd.concat raises on an empty list, so fall back to an empty frame.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# now save the data frame; leaving the with-block saves and closes the file
# (ExcelWriter.save() was removed in pandas 2.0)
with pd.ExcelWriter('output.xlsx') as writer:
    all_data.to_excel(writer, sheet_name='sheet1')

Related

Pandas - Move output columns into a new sheet

I have many dataframes as txt files that I'm converting into xlsx. For each file, I want to take my output columns and move them into a new sheet called "Analyzed Data". I'm not sure how to do this with ExcelWriter:
# Create a writer for the target workbook path, backed by the xlsxwriter engine.
writer = pd.ExcelWriter('filepath', engine = 'xlsxwriter')
# Write df to its own sheet; the sheet name is used exactly as given.
df.to_excel(writer, sheet_name = ' Data Analyzed')
writer.save()
My understanding is that this requires my file to be xlsx, I have to write the filepath separately for each xlsx file, and I'm not sure how to select only my output columns as the ones to move to the new sheet. Each file has a different amount of columns with different column names. My code is below:
import os
import pandas as pd
path = r'C:\Users\Me\1Test'

# Every .txt file found anywhere below `path`, as a joined path.
filelist = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path)
    for name in names
    if name.endswith('.txt')
]

for f in filelist:
    df = pd.read_table(f)

    # Row-wise mean over all but the last three columns; the right-hand side
    # is evaluated before 'Average' is appended as a new last column.
    df['Average'] = df.iloc[:, :-3].mean(axis=1)

    # With 'Average' in place, subtract the third-from-last column from the
    # others (that column and 'Average' themselves are dropped first).
    reference = df[df.columns[-3]]
    shifted = df.drop(df.columns[[-3, -1]], axis=1).sub(reference, axis=0)
    out = df.join(shifted.add_suffix(' - Background'))

    # Every 'txt' in the path string is replaced, not only the extension.
    out.to_excel(f.replace('txt', 'xlsx'), 'Sheet1')

Pandas - Trying to store multiple .txt files in a .csv

I have a folder with about 500 .txt files. I would like to store the content in a csv file, with 2 columns, column 1 being the name of the file and column 2 being the file content in string. So I'd end up with a CSV file with 501 rows.
I've snooped around SO and tried to find similar questions, and came up with the following code:
import pandas as pd
from pandas.io.common import EmptyDataError
import os
def Aggregate_txt_csv(path):
    """Parse the entries listed in ``path`` with pandas and write a frame to 'file.csv'.

    NOTE(review): indentation was lost in the paste; the ``return`` is assumed
    to follow the loop, so only the last ``df`` reaches the CSV — confirm.
    """
    for files in os.listdir(path):
        # os.listdir yields bare names; open() receives the name without
        # ``path`` joined onto it.
        with open(files, 'r') as file:
            try:
                df = pd.read_csv(file, header=None, delim_whitespace=True)
            except EmptyDataError:
                # An input with nothing to parse becomes an empty frame.
                df = pd.DataFrame()
    return df.to_csv('file.csv', index=False)
However it returns an empty .csv file. Am I doing something wrong?
There are several problems in your code. One of them is that pd.read_csv is not opening the file because you're not passing the path to the given file. I think you should try starting from this code:
import os
import pandas as pd
from pandas.io.common import EmptyDataError
def Aggregate_txt_csv(path):
    """Combine every file in ``path`` into one table written to 'file.csv'.

    Each file is parsed as whitespace-delimited text without a header row and
    tagged with a ``file`` column holding its name. A file with nothing to
    parse contributes a single row that carries only its name.

    Args:
        path: Directory whose files are read. Sub-directories are skipped.

    Returns:
        None. The combined table is written, without the index, to
        'file.csv' in the current working directory.
    """
    frames = []
    # Sorted so the row order does not depend on the directory listing order.
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        if not os.path.isfile(full_path):
            continue
        try:
            # sep=r"\s+" replaces the deprecated delim_whitespace=True.
            d = pd.read_csv(full_path, header=None, sep=r"\s+")
        except pd.errors.EmptyDataError:
            d = pd.DataFrame({"file": [file]})
        else:
            d["file"] = file
        frames.append(d)

    # pd.concat raises on an empty list; an empty directory gives a
    # header-only file instead of an exception.
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=["file"])
    df.to_csv('file.csv', index=False)
Use pathlib
Path.glob() to find all the files
When using path objects, file.stem returns the file name from the path.
Use pandas.concat to combine the dataframes in df_list
from pathlib import Path
import pandas as pd
p = Path('e:/PythonProjects/stack_overflow')  # folder holding the text files

# One single-row frame per *.txt file directly inside the folder.
df_list = []
for file in p.glob('*.txt'):
    with file.open('r') as f:
        # Each line is stripped of surrounding whitespace before re-joining.
        stripped = [line.strip() for line in f]
    text = '\n'.join(stripped)
    # file.stem is the file name without its suffix.
    row = pd.DataFrame({'filename': [file.stem], 'contents': [text]})
    df_list.append(row)

# Stack the rows and write them out without the index.
df_all = pd.concat(df_list)
df_all.to_csv('files.txt', index=False)
I noticed there's already an answer, but I've gotten it to work with a relatively simple piece of code. I've only edited the file read-in a little bit, and the dataframe is outputting successfully.
Link here
import pandas as pd
from pandas.io.common import EmptyDataError
import os
def Aggregate_txt_csv(path):
    """Collect the text of every regular file in ``path`` into a DataFrame.

    Args:
        path: Directory to scan. The scan is not recursive, and entries that
            are not regular files are skipped.

    Returns:
        A DataFrame with one row per file: ``title`` is the file name and
        ``body`` is the file's full text. Both columns exist even when the
        directory holds no files.
    """
    names = os.listdir(path)
    print(names)
    result = []
    for files in names:
        fullpath = os.path.join(path, files)
        if not os.path.isfile(fullpath):
            continue
        # errors='replace' keeps undecodable bytes from aborting the read.
        with open(fullpath, 'r', errors='replace') as file:
            # read() returns the text unchanged; '\n'.join(readlines()) would
            # double every line break because each line keeps its own '\n'.
            result.append({'title': files, 'body': file.read()})
    # Columns are named explicitly so an empty result still has them.
    return pd.DataFrame(result, columns=['title', 'body'])
# Build the table from the 'files' directory, show it, then save it.
# to_csv is called with its defaults, so the index column is written too.
df = Aggregate_txt_csv('files')
print(df)
df.to_csv('result.csv')
Most importantly here, I am appending to an array so as not to run pandas' concatenate function too much, as that would be pretty bad for performance. Additionally, reading in the file should not need read_csv, as there isn't a set format for the file. So using '\n'.join(file.readlines()) allows you to read in the file plainly and take out all lines into a string.
At the end, I convert the array of dictionaries into a final dataframe, and it returns the result.
EDIT: for paths that aren't the current directory, I updated it to append the path so that it could find the necessary files, apologies for the confusion

Multiple tabs in single excel

I am using the code below to create a single Excel file with multiple tabs based on the CSV files present in path. I have two files in my path, but instead of getting two tabs in a single Excel file I am getting one blank tab. Please help me fix this code.
import os
import glob
import xlsxwriter
import csv
import pandas
path='/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/'
# The glob pattern is the working directory with the absolute data path
# appended; only the base names of the matches are kept.
flist = [os.path.basename(x) for x in glob.glob(os.getcwd() + '/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/*.csv')]
workbook = xlsxwriter.Workbook('/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/split_book.xlsx')
for sh in flist:
    # One worksheet per CSV, named after the file.
    worksheet = workbook.add_worksheet(sh)
    # sh is a bare file name, so it is opened relative to the working
    # directory, and in binary mode.
    with open(sh, 'rb') as f:
        reader = csv.reader(f)
        # Copy each CSV cell to the same row/column position in the sheet.
        for r, row in enumerate(reader):
            for c, col in enumerate(row):
                worksheet.write(r, c, col)
workbook.close()
Three problems:
1) flist = [os.path.basename(x) for x in glob.glob(os.getcwd() + '/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/*.csv')]
Assuming that os.getcwd() is the same as your path, you will end up with the pathname twice. This means that flist will be empty. Since you have gone through the trouble of setting path, why not just
flist = [os.path.basename(x) for x in glob.glob(path + '*.csv')]
2) Same as above
# Build the output location from `path`, and still create the Workbook
# object (a bare string has no add_worksheet/close methods).
workbook = xlsxwriter.Workbook(path + 'split_book.xlsx')
3) The file should be opened as a text file
with open(sh, 'r') as f
Try that and your program should work. You don't need pandas for this - is that for later?
Read the files using pandas and combine all of them
import os
import pandas as pd
# Names of the CSV files in "Your Directory/". Other entries are ignored,
# since read_csv and the sheet naming below only make sense for CSV files.
csv_names = [
    files for files in os.listdir("Your Directory/")
    if files.lower().endswith(".csv")
]

# Leaving the with-block saves and closes the workbook
# (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('Multiple Workbooks.xlsx', engine='xlsxwriter') as writer:
    for files in csv_names:
        df = pd.read_csv(os.path.join("Your Directory", files))  # read csv file
        filename = os.path.splitext(files)[0]  # file name without its extension
        df.to_excel(writer, sheet_name=filename)  # add to workbook
In short, you can write each DataFrame to its own tab using
# The workbook is only written to disk when the writer is closed; the
# with-block does that on exit.
with pd.ExcelWriter('Multiple Workbooks.xlsx', engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name="SheetName")
    df2.to_excel(writer, sheet_name="SheetName2")

Join multiple excel files into one excel file via python

I need to combine all the excel files in my directory into one excel file.
for example, I've 3 excel files:
file1:
file 2:
file 3:
I need to concatenate them and get an output as follows
but instead, they were appended one after another
this is my code :
import pandas as pd
import numpy as np
import glob
all_data = pd.DataFrame()
for f in glob.glob('C:/Users/test-results/FinalResult/05-01-2019/*.xlsx'):
    df = pd.read_excel(f)
    # Rows of each workbook are added below the rows collected so far;
    # ignore_index=True renumbers the combined index.
    all_data = all_data.append(df, ignore_index=True)
# Write the combined frame to a single sheet.
writer = pd.ExcelWriter('mycollected_data.xlsx', engine='xlsxwriter')
all_data.to_excel(writer, sheet_name='Sheet1')
writer.save()
During my search, all I found was how to append DataFrames, as shown in my code, and I couldn't figure out how to use join.
You could use
files = glob.glob('C:/Users/test-results/FinalResult/05-01-2019/*.xlsx')
# Generator: each workbook is read when pd.concat consumes it, with its
# first column used as the index.
dfs = (pd.read_excel(f, index_col=0) for f in files)
# axis=1 places the frames side by side, aligned on that index.
all_data = pd.concat(dfs, axis=1)
Try this:
all_data = pd.concat([all_data,df],axis=1)
all_data = all_data.merge(df, on = ['first_column_name'], how = 'left')

Df export all into one larger file

I have a df reading in multiple .xlsx files. I have manipulated what I need in the files and the export view is exact. However, I need the data to export into one larger 2 column file rather than multiple individual files.
Any help is appreciated. I haven't been able to figure the problem out on my own.
import os
import glob
import pandas as pd
folder = input('Enter the folder name: ')
os.chdir('C:/Users/PCTR261010/Desktop/' + folder)
FileList = glob.glob('*.xlsx')
# NOTE(review): indentation was lost in the paste; the export is assumed to
# run inside the loop — confirm against the original post.
for fname in FileList:
    # 'New' is 'mpcc_' followed by the text after the first '#' within the
    # part of the file name that precedes the first '-'.
    df = pd.read_excel(fname).assign(New=os.path.basename('mpcc_' + (fname.split('-', 1)[0]).split('#', 1)[1]))
    df1 = df[['New', '<ID>']]
    # The same output path is opened for every input file.
    writer = pd.ExcelWriter('ParttoMPCC_Import.xlsx', engine='xlsxwriter')
    df1.to_excel(writer, sheet_name='Import', index=False, header=False)
    writer.save()
You can append desired columns in a single DataFrame and write that DataFrame to an excel file. Below code should do the job.
import os
import glob
import pandas as pd
folder = input('Enter the folder name: ')
os.chdir('C:/Users/PCTR261010/Desktop/' + folder)
FileList = glob.glob('*.xlsx')
df1 = pd.DataFrame() # create an empty df
for fname in FileList:
    # 'New' is 'mpcc_' followed by the text after the first '#' within the
    # part of the file name that precedes the first '-'.
    df = pd.read_excel(fname).assign(New=os.path.basename('mpcc_' + (fname.split('-', 1)[0]).split('#', 1)[1]))
    # Append the two columns to df1. DataFrame.append was removed in
    # pandas 2.0; pd.concat performs the same row-wise append.
    df1 = pd.concat([df1, df[['New', '<ID>']]])
# Write once, after the loop; leaving the with-block saves and closes the
# file (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('ParttoMPCC_Import.xlsx', engine='xlsxwriter') as writer:
    df1.to_excel(writer, sheet_name='Import', index=False, header=False)
You can use pd.concat as follows:
data = []
for fname in FileList:
    df = pd.read_excel(fname).assign(New=os.path.basename('mpcc_' + (fname.split('-', 1)[0]).split('#', 1)[1]))
    df1 = df[['New', '<ID>']]
    data.append(df1)
# Combine all the two-column pieces in a single call.
df = pd.concat(data)
# Open the writer only once the data is ready; leaving the with-block saves
# and closes the file (ExcelWriter.save() was removed in pandas 2.0).
with pd.ExcelWriter('ParttoMPCC_Import.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Import', index=False, header=False)

Categories