I am using the code below to create a single Excel workbook with multiple tabs, one for each CSV file present in the path. I have two files in my path, but instead of getting two tabs in a single workbook I get one blank tab. Please help me fix this code.
import os
import glob
import xlsxwriter
import csv
import pandas
# Directory that holds the input CSV files.
path='/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/'
# Glob pattern: the current working directory concatenated with the data
# directory, exactly as posted.
csv_pattern = os.getcwd() + '/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/*.csv'
# Base names (no directory part) of every file matched by the pattern.
flist = [os.path.basename(match) for match in glob.glob(csv_pattern)]
workbook = xlsxwriter.Workbook('/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/split_book.xlsx')
for sh in flist:
    # One worksheet per CSV file, named after the file.
    worksheet = workbook.add_worksheet(sh)
    with open(sh, 'rb') as f:
        reader = csv.reader(f)
        # Copy each CSV cell to the same row/column position in the sheet.
        for row_idx, record in enumerate(reader):
            for col_idx, cell in enumerate(record):
                worksheet.write(row_idx, col_idx, cell)
# Finalize the workbook once all sheets have been filled.
workbook.close()
Three problems:
1) flist = [os.path.basename(x) for x in glob.glob(os.getcwd() + '/axp/buanalytics/csgsn/dev/GSN/VGEN_Files/Test/Tulu/VG/Data/*.csv')]
Assuming that os.getcwd() is the same as your path, you will end up with the pathname twice. This means that flist will be empty. Since you have gone through the trouble of setting path, why not just
# Build the list of CSV base names from the already-defined `path` instead of prefixing os.getcwd().
flist = [os.path.basename(x) for x in glob.glob(path + '*.csv')]
2) Same as above
# Create the workbook object at the target location; assigning only the
# concatenated string would leave `workbook` without add_worksheet()/close().
workbook = xlsxwriter.Workbook(path + 'split_book.xlsx')
3) The file should be opened as a text file
# `sh` is only a base name, so join it with `path` to open the right file no
# matter what the current working directory is. newline='' is what the csv
# module expects for files handed to csv.reader.
with open(os.path.join(path, sh), 'r', newline='') as f:
    ...  # body of the original `with` block goes here, unchanged
Try that and your program should work. You don't need pandas for this - is that for later?
Read the files using pandas and combine all of them
import os
import pandas as pd
# Directory holding the CSV files to combine (placeholder from the answer).
csv_dir = "Your Directory"
# Keep only CSV files so that stray files in the directory are not passed to
# read_csv; sort for a deterministic sheet order.
csv_names = sorted(
    name for name in os.listdir(csv_dir) if name.lower().endswith(".csv")
)
# The context manager saves and closes the workbook on exit
# (ExcelWriter.save() no longer exists in pandas 2.x).
with pd.ExcelWriter('Multiple Workbooks.xlsx', engine='xlsxwriter') as writer:
    for csv_name in csv_names:
        df = pd.read_csv(os.path.join(csv_dir, csv_name))  # read csv file
        # Strip the extension and respect Excel's 31-character sheet-name limit.
        # NOTE: names that only differ after 31 characters would collide.
        sheet_name = os.path.splitext(csv_name)[0][:31]
        df.to_excel(writer, sheet_name=sheet_name)  # add to workbook
In short, you can write each DataFrame to its own tab using
# Use the writer as a context manager so the workbook is saved and closed on
# exit; without that step nothing is written to disk.
with pd.ExcelWriter('Multiple Workbooks.xlsx', engine='xlsxwriter') as writer:
    # Each to_excel call with a distinct sheet_name adds one tab.
    df1.to_excel(writer, sheet_name="SheetName")
    df2.to_excel(writer, sheet_name="SheetName2")
Related
I have many dataframes as txt files that I'm converting into xlsx. For each file, I want to take my output columns and move them into a new sheet called "Analyzed Data". I'm not sure how to do this with ExcelWriter:
# The context manager saves and closes the workbook on exit, replacing the
# explicit writer.save() call (removed in pandas 2.0).
with pd.ExcelWriter('filepath', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name=' Data Analyzed')
My understanding is that this requires my file to be xlsx, I have to write the filepath separately for each xlsx file, and I'm not sure how to select only my output columns as the ones to move to the new sheet. Each file has a different amount of columns with different column names. My code is below:
import os
import pandas as pd
# Root folder that is searched recursively for .txt files.
path = r'C:\Users\Me\1Test'
filelist = [
    os.path.join(root, name)
    for root, dirs, files in os.walk(path)
    for name in files
    if name.endswith('.txt')
]
for f in filelist:
    df = pd.read_table(f)
    # Row-wise mean over every column except the last three.
    col = df.iloc[:, :-3]
    df['Average'] = col.mean(axis=1)
    # Drop the third-from-last column and the just-added 'Average', subtract
    # the third-from-last column from what remains, and join the result back
    # onto df with a ' - Background' suffix on the new column names.
    out = df.join(
        df.drop(df.columns[[-3, -1]], axis=1)
        .sub(df[df.columns[-3]], axis=0)
        .add_suffix(' - Background')
    )
    # Swap only the file extension; str.replace('txt', 'xlsx') would also
    # rewrite any "txt" that happens to appear elsewhere in the path.
    out.to_excel(os.path.splitext(f)[0] + '.xlsx', sheet_name='Sheet1')
Most of the articles I'm seeing either:
a) Combine multiple excel single-sheet workbooks into one master workbook with just a single sheet or;
b) Split a multiple-sheet excel workbook into individual workbooks.
However, my goal is to grab all the excel files in a specific folder and save them as individual sheets within one new master excel workbook. I'm trying to rename each sheet name as the name of the original file.
import pandas as pd
import glob
import os
# Folder containing the workbooks to consolidate (with trailing separator).
file = "C:\\File\\Path\\"
# Name of the consolidated output workbook, created inside that folder.
filename = 'Consolidated Files.xlsx'
pth = os.path.dirname(file)
extension = os.path.splitext(file)[1]
# Every path in the folder whose name ends in "xlsx".
files = glob.glob(os.path.join(pth, '*xlsx'))
w = pd.ExcelWriter(file + filename)
for f in files:
    print(f)
    # header=None: the first row is read as data, not as column names.
    df = pd.read_excel(f, header=None)
    print(df)
    # The full path `f` is used as the sheet name here.
    df.to_excel(w, sheet_name=f, index=False)
w.save()
How do I adjust the names for each sheet? Also, if you see any opportunities to clean this up please let me know
You cannot use f as the sheet name because it is the full path plus the file name, which contains characters that are not allowed in sheet names. Use only the file name for the sheet name: use os.path.basename to get the file name, then separate the name from its extension.
for f in files:
    print(f)
    df = pd.read_excel(f, header = None)
    print(df)
    # Use basename to drop the directory part and splitext to drop only the
    # final extension (so "report.v2.xlsx" becomes "report.v2", not "report").
    # Excel limits sheet names to 31 characters, so longer names are truncated.
    # NOTE: names that only differ after 31 characters would collide.
    new_sheet_name = os.path.splitext(os.path.basename(f))[0][:31]
    df.to_excel(w, sheet_name = new_sheet_name , index = False)
I decided to put my solution here as well, just in case it would be useful to anyone.
The thing is, I wanted to be able to recall where each resulting sheet came from. However, source workbooks can (and likely will) often have the same sheet names, like "Sheet 1", so I couldn't just use the sheet names from the original workbooks. I also could not use the source filenames as sheet names, since they might be longer than 31 characters, which is the maximum sheet name length allowed by Excel.
Therefore, I ended up assigning incremental numbers to resulting sheet names, while simultaneously inserting a new column named "source" at the start of each sheet and populating it with file name concatenated with sheet name. Hope it might help someone :)
from glob import glob
import pandas as pd
import os
files_input = glob(r'C:\Path\to\folder\*.xlsx')
result_DFs = []
for xlsx_file in files_input:
    # sheet_name=None loads every sheet as a {sheet name: DataFrame} dict.
    file_DFs = pd.read_excel(xlsx_file, sheet_name=None)
    for sheet_name, sheet_DF in file_DFs.items():
        # Record "<file name>:<sheet name>" in a leading 'source' column so the
        # origin of each resulting sheet can be traced.
        source_name = os.path.basename(xlsx_file) + ":" + sheet_name
        sheet_DF.insert(0, 'source', source_name)
        result_DFs.append(sheet_DF)
# set_column() below is an xlsxwriter worksheet method, so request that engine
# explicitly instead of relying on whichever default pandas selects.
with pd.ExcelWriter(r'C:\Path\to\resulting\file.xlsx', engine='xlsxwriter') as writer:
    for df_index, result_DF in enumerate(result_DFs):
        # A simple incremental number is used as the new sheet name.
        sheet_title = str(df_index)
        result_DF.to_excel(writer, sheet_name=sheet_title, index=False)
        # Auto-adjust column widths (can be omitted if not needed).
        worksheet = writer.sheets[sheet_title]
        for i, col in enumerate(result_DF.columns):
            # default=0 covers sheets without data rows; str(col) covers
            # column labels that are not strings (e.g. numeric headers).
            longest_value = max((len(str(value)) for value in result_DF[col]), default=0)
            column_len = max(longest_value, len(str(col)) + 3)
            worksheet.set_column(i, i, column_len)
I am trying to read data from multiple xls files and write it to one single file.
My code below is writing only the first file. Not sure what I am missing.
import glob import os import pandas as pd
def list_files(dir):
    """Return the paths of all files found under *dir*, searched recursively.

    Each path is the walked directory joined with the file name; directories
    themselves are not included in the result.
    """
    return [
        os.path.join(parent, filename)
        for parent, _subdirs, filenames in os.walk(dir)
        for filename in filenames
    ]
# Collect every file path under the BOFS folder via list_files.
files = list_files("C:\\Users\\12345\\BOFS")
# For each file: read it, take the row at position 1 as the new header, and
# keep only the rows from position 2 onward.
for file in files:
df = pd.read_excel(file)
new_header = df.iloc[1]
df = df[2:]
df.columns = new_header
# NOTE(review): the indentation of this paste was lost, so it is unclear
# whether the writer below is opened inside or after the loop — confirm.
# The writer is opened with mode='a' and to_excel is called without a
# sheet_name argument.
with pd.ExcelWriter("C:\\Users\\12345\\Test\\Test.xls", mode='a') as writer:
df.to_excel(writer,index=False, header=True,)
Documentation says:
ExcelWriter can also be used to append to an existing Excel file:
with pd.ExcelWriter('output.xlsx',
mode='a') as writer:
df.to_excel(writer, sheet_name='Sheet_name_3')
And that probably replaces given sheet
But you could use pd.concat(<dataframes>) to concatenate dataframes and write all data at once in a single sheet.
I tested this piece of code; hopefully it works in your case.
import glob
import os

import pandas as pd

OUTPUT_NAME = 'output.xlsx'

os.chdir("D:/Data Science/stackoverflow")
# Read every workbook in the folder, skipping the output file itself so that a
# second run does not fold the previous result back into the data.
frames = [
    pd.read_excel(file)
    for file in glob.glob("*.xlsx")
    if file != OUTPUT_NAME
]
# pd.concat replaces the removed DataFrame.append; it rejects an empty list,
# so fall back to an empty frame when no workbooks were found.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# now save the data frame; the context manager saves and closes the workbook
with pd.ExcelWriter(OUTPUT_NAME) as writer:
    all_data.to_excel(writer, sheet_name='sheet1')
I am trying to write a code which takes all the .csv files in a directory, which are semi colon delimited, and formats the .csv file into columns. This is my code:
import pandas as pd
import glob
# Folder containing the semicolon-delimited CSV files (placeholder path).
path = r'C:...'
# Raw string so the backslash is a literal path separator rather than the
# invalid escape sequence "\*".
csv_file = path + r"\*.csv"
allFiles = glob.glob(csv_file)
for file in allFiles:
    # Parse the file using ';' as the field separator ...
    dataframe = pd.read_csv(file, delimiter=';')
    # ... and overwrite it in place as a regular comma-separated file.
    dataframe.to_csv(file, encoding='utf-8', index=False)
I have tested the dataframe = part of this code, it works as desired for one .csv file, but I cannot get this to repeat for all files in the folder. Any ideas? Thanks.
If all you want to do is change ; to , in the files, something like this would work:
import os

# Walk the tree and rewrite every .csv file, replacing ';' with ','.
for root, dirs, files in os.walk("/dirname/"):
    csv_files = [ff for ff in files if ff.endswith('.csv')]
    for f in csv_files:
        # os.walk yields bare file names; join with `root` to get a usable path.
        csv_path = os.path.join(root, f)
        # Read and write through the file object `tf`, not the name string `f`.
        with open(csv_path) as tf:
            s = tf.read()
        with open(csv_path, "w") as tf:
            tf.write(s.replace(";", ","))
You can use pandas and do something like this:
import pandas as pd
# Parse the semicolon-separated input file into a DataFrame.
semicolon_frame = pd.read_csv("csv_semicolon.csv", sep=";")
# Write it back out tab-separated, without the index column.
semicolon_frame.to_csv("csv_tab.csv", sep="\t", index=False)
So I'm attempting to exclude the top three rows during a data extraction.
for col_index in xrange(sheet.ncols):
    # Values of one column, starting at row index 3.
    column_values = sheet.col_values(col_index, start_rowx=3, end_rowx=None)
    # Each column's values are written out as a single CSV row
    # (this syntax also may be skewing my results as well).
    writer.writerow(column_values)
This for loop eliminates the top 3 rows but then turns the rows into columns.
Any advice on how to maintain the data structure but at the same time eliminate rows?
Full script below:
import glob
import os
import xlrd
import csv
# Folder containing the .xlsx workbooks to merge.
ROOTDIR = r'C:\Users\username\Desktop\Mults'
wb_pattern = os.path.join(ROOTDIR, '*.xlsx')
workbooks = glob.glob(wb_pattern)
with open('merged.csv', 'wb') as outcsv:
    writer = csv.writer(outcsv)
    for wb in workbooks:
        book_path = os.path.join(ROOTDIR, wb)
        # Only the first sheet of each workbook is read.
        sheet = xlrd.open_workbook(book_path).sheet_by_index(0)
        for col_index in xrange(sheet.ncols):
            # Values of one column, starting at row index 2 ...
            column_values = sheet.col_values(col_index, start_rowx=2, end_rowx=None)
            # ... written out as a single CSV row
            # (this syntax also may be skewing my results).
            writer.writerow(column_values)
Thank you!
Any help is much appreciated!
If you want row values, why are you pulling the columns to write as rows? Pull the row values and write those:
import glob
import os
import xlrd
import csv
# Folder containing the .xlsx workbooks to merge.
ROOTDIR = r'C:\Users\username\Desktop\Mults'
wb_pattern = os.path.join(ROOTDIR, '*.xlsx')
workbooks = glob.glob(wb_pattern)
start_rownum = 3  # or wherever you want to start copying
# Python 3: the csv module needs a text-mode file opened with newline=''.
with open('merged.csv', 'w', newline='') as outcsv:
    writer = csv.writer(outcsv)
    for wb in workbooks:
        # glob already returns paths that include ROOTDIR, so no extra join.
        # NOTE(review): recent xlrd releases only read .xls files — confirm the
        # installed version can open .xlsx.
        book = xlrd.open_workbook(wb)
        sheet = book.sheet_by_index(0)
        # The row count attribute is `nrows`; copy rows, not columns, so the
        # original layout is kept while the leading rows are skipped.
        for rownum in range(start_rownum, sheet.nrows):
            writer.writerow(sheet.row_values(rownum))