I am new to Python. This task is mainly to read the Excel files in a directory, filter the data in each one, and write the filtered data to Excel. When I try to write to Excel, only the last iteration's values are stored. Please advise how to write all the data to Excel. I want to write df_filter and df_filter1, which are built inside a for loop, to an Excel file.
import os
import xlrd
import pandas as pd
from openpyxl import load_workbook

ALL_SHEETS = []
sheet_list = ""
file_path = os.path.join(input("enter Dir path"))
config_path = os.path.join(input("enter your config file path here"))
output_path = os.path.join(input("Dude where you want store outputfile"))
output1 = pd.ExcelWriter(output_path, engine='xlsxwriter')
ALL_SHEETS = [os.path.join(file_path, f) for f in os.listdir(file_path)
              if os.path.isfile(os.path.join(file_path, f))
              and f.endswith('.xlsx')]
i = 0
data1 = []
data = []
Packet_size = []
Trail_numbers = []
Though_put = []
Latency = []
Jitter = []
df_filter = pd.DataFrame(columns=['packetsize', 'throughput', 'latency (us)', 'jitter (us)'])
df_filter1 = pd.DataFrame(columns=['packetsize', 'throughput', 'latency (us)', 'jitter (us)'])
#df_sheet = pd.DataFrame(columns=['zsheet'])
merged_inner = pd.DataFrame([])

def sheets(val):
    s = wb.worksheets[val]
    df_sheet = pd.DataFrame(data=['%s' % str(s) + '\n'])
    #Name_sheet(s)
    HeaderList = pd.read_csv(config_path)
    column_list = []
    for col in HeaderList:
        col = col.lstrip("'")
        col = col.rstrip("'")
        column_list.append(col)
    df1 = xl.parse(sheet_list[val], skiprows=i)
    df1 = df1.filter(column_list)
    df2 = df1[(df1['Result'] != 'Failed') & (df1['Frame Size Type'] == 'iMIX')]
    if df2.empty:
        pass
    else:
        final3 = df2.groupby(['Trial Number', 'iMIX Distribution'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        #df_filter['sheetaname'] = df_sheet(lambda a: '%s' % a['sheetvise'], axis=1)
        final = final3.groupby(['iMIX Distribution'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        df_filter['packetsize'] = final.apply(lambda z: '%s' % (z['iMIX Distribution']), axis=1)
        df_filter['throughput'] = final.apply(lambda z: '%s' % (z['Throughput (%)']), axis=1)
        df_filter['latency (us)'] = final.apply(lambda x: '%s/%s/%s' % (x['Minimum Latency (us)'], x['Maximum Latency (us)'], x['Average Latency (us)']), axis=1)
        df_filter['jitter (us)'] = final.apply(lambda y: '%s/%s/%s' % (y['Minimum Jitter (us)'], y['Maximum Jitter (us)'], y['Average Jitter (us)']), axis=1)
        df_filter.to_excel(output1, sheet_name='mani')
        output1.save()
        df_filter.to_excel(output1, startrow=len(df_filter1) + len(df_filter) + 2, sheet_name='mani')
        output1.save()
    df3 = df1[(df1['Result'] != 'Failed') & (df1['Frame Size Type'] == 'Fixed')]
    if df3.empty:
        pass
    else:
        final2 = df3.groupby(['Trial Number', 'Configured Frame Size'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        final1 = final2.groupby(['Configured Frame Size'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        df_filter1['packetsize'] = final1.apply(lambda z: '%s' % (z['Configured Frame Size']), axis=1)
        df_filter1['throughput'] = final1.apply(lambda z: '%s' % (z['Throughput (%)']), axis=1)
        df_filter1['latency (us)'] = final1.apply(lambda x: '%s/%s/%s' % (x['Minimum Latency (us)'], x['Maximum Latency (us)'], x['Average Latency (us)']), axis=1)
        df_filter1['jitter (us)'] = final1.apply(lambda y: '%s/%s/%s' % (y['Minimum Jitter (us)'], y['Maximum Jitter (us)'], y['Average Jitter (us)']), axis=1)
        df_filter1.to_excel(output1, sheet_name='mani')
        df_filter1.to_excel(output1, startrow=len(df_filter1) + len(df_filter) + 2, sheet_name='mani')
        output1.save()

def sheet_every():
    for sheet in range(0, sheet_list_length):
        sheets(sheet)

for file in ALL_SHEETS:
    df_file = pd.DataFrame(data=[file])
    workbook = xlrd.open_workbook(file)
    wb = load_workbook(file)
    xl = pd.ExcelFile(file)
    i = 0
    sheet_list = workbook.sheet_names()
    sheet_list_length = len(sheet_list)
    for sheet in sheet_list:
        worksheet = workbook.sheet_by_name(sheet)
        for i in range(0, worksheet.nrows):
            row = worksheet.row_values(i)
            if 'Trial Number' in row:
                break
    sheet_every()
Not sure if this answers your question or not, but if you want to read from a dataframe and add rows to a new dataframe through a loop, you can refer to the code below:
import pandas as pd

dummyData = pd.read_csv("someexcelfile.csv")
# You can merge multiple dataframes into dummyData and make it one big dataframe
dummyInsertTable = pd.DataFrame(columns=["Col1", "Col2", "Col3"])
for i in range(len(dummyData)):
    dummyInsertTable.loc[i, "Col1"] = dummyData["Col1"][i]
    dummyInsertTable.loc[i, "Col2"] = dummyData["Col2"][i]
    dummyInsertTable.loc[i, "Col3"] = dummyData["Col3"][i]
dummyInsertTable.to_csv("writeCSVFile.csv")
And next time, please be precise about where you are facing the problem.
EDIT
Try loading the first dataframe, then loop through the other files and append each one to the first dataframe. Refer to the code:
import pandas as pd

# Make a list of all the files you have
filesList = ["/home/bhushan/firstFile.csv", "/home/bhushan/secondFile.csv",
             "/home/bhushan/thirdFile.csv", "/home/bhushan/fourthFile.csv"]

# Read the first csv file using pandas.read_csv
firstFile = pd.read_csv(filesList[0])

# Loop through the rest of the files and append each one to the first DataFrame
# (DataFrame.append was removed in pandas 2.0; pd.concat does the same job)
for i in range(1, len(filesList)):
    fileToBeAdded = pd.read_csv(filesList[i])
    firstFile = pd.concat([firstFile, fileToBeAdded])

# Write the final file
finalFile = firstFile
finalFile.to_csv("finalFile.csv")
If I understand your question correctly, you have two data frames which you want to write to one Excel file, but you are only getting the last one.
You should write them to two different sheets instead; then you can retrieve them as required, either individually or combined.
Follow the links below for more details and implementation:
https://xlsxwriter.readthedocs.io/example_pandas_multiple.html
https://campus.datacamp.com/courses/importing-managing-financial-data-in-python/importing-stock-listing-data-from-excel?ex=11
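For example, here is a minimal sketch of the two-sheet approach (my assumptions: the df_filter and df_filter1 frames from your question exist; the output file and sheet names are illustrative):

import pandas as pd

# Write each DataFrame to its own sheet so neither overwrites the other;
# the workbook is saved automatically when the with-block exits.
with pd.ExcelWriter('output.xlsx', engine='xlsxwriter') as writer:
    df_filter.to_excel(writer, sheet_name='iMIX')
    df_filter1.to_excel(writer, sheet_name='Fixed')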
Alternatively, you can write to a CSV file, which is also Excel-compatible and easier to handle; I have observed that it is faster and more space-efficient than writing to a .xlsx file.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
Related
I have multiple CSV files to be imported into multiple worksheets, each named the same as its CSV file.
However, I am having difficulties creating/appending the multiple worksheets.
If I use ExcelWriter(pathDestination, mode='a'), a FileNotFoundError happens.
If I use ExcelWriter(pathDestination), then only the last CSV file ends up in the workbook.
How can I improve the code without listing each csvpath individually when doing the to_excel?
import openpyxl
import numpy as np
import pandas as pd
import os

pathDestination = 'Downloads/TemplateOne.xlsx'
csvpathI = '2019_27101220_Export.csv'
csvpathII = '2019_27101220_Import.csv'
csvpathIII = '2020_27101220_Export.csv'
csvpathIV = '2020_27101220_Import.csv'
csvpath_list = [csvpathI, csvpathII, csvpathIII, csvpathIV]

for csvpath in csvpath_list:
    df = pd.read_csv(csvpath)
    conversion_unit = 1000
    supplymentry_conversion_unit = 1000
    df['quantity_converted'] = np.multiply(df['Quantity'], conversion_unit)
    df['supplimentry_quantity_converted'] = np.multiply(df['Supplimentary Quantity'], conversion_unit)
    csvnames = os.path.basename(csvpath).split(".")[0]
    with pd.ExcelWriter(pathDestination) as writer:
        df.to_excel(writer, sheet_name=csvnames, index=False)
You need to put the loop inside the context manager in order to save each DataFrame to a separate sheet; opening a new ExcelWriter on every iteration recreates the output file, which is why only the last CSV survived.
Try this:
conversion_unit = 1000
supplymentry_conversion_unit = 1000

with pd.ExcelWriter(pathDestination) as writer:
    for csvpath in csvpath_list:
        df = pd.read_csv(csvpath)
        df['quantity_converted'] = df['Quantity'].mul(conversion_unit)
        df['supplimentry_quantity_converted'] = df['Supplimentary Quantity'].mul(supplymentry_conversion_unit)
        df.to_excel(writer, sheet_name=csvpath.split(".")[0], index=False)
So here is a sample of my Excel layout (screenshot omitted). But after merging, the result has two headers and loses the layout.
Here is my code:
import pandas as pd
import glob

path = r"C:/Users//"
fname = glob.glob(path + "/*.xlsx")

result_DFs1 = pd.DataFrame()
result_DFs2 = pd.DataFrame()

for i in fname:
    try:
        df1 = pd.read_excel(i, sheet_name="Test1")
        result_DFs1 = pd.concat([result_DFs1, df1])
    except:
        pass

for i in fname:
    try:
        df2 = pd.read_excel(i, sheet_name="Test2")
        result_DFs2 = pd.concat([result_DFs2, df2])
    except:
        pass

with pd.ExcelWriter('pandas_to_excel.xlsx') as writer:
    result_DFs1.to_excel(writer, sheet_name='Test1')
    result_DFs2.to_excel(writer, sheet_name='Test2')
Is there a way I can keep just one header without losing the Excel layout?
You can keep track of your sheets and only include headers for the first one. Something like:

first = True
for i in fname:
    try:
        if first:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=0, header=0)
            columns = df1.columns
            first = False
        else:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=1, header=None)
            df1.columns = columns  # reuse the first file's header so concat aligns the columns
        result_DFs1 = pd.concat([result_DFs1, df1])
    except:
        pass
A csv file has 90 million rows. One of the columns is named "State". It has 12 unique values at present. (The count of unique values in the "State" column is dynamic and can change with each csv file.)
I want to split the DataFrame into smaller chunks and then save State-wise files.
The code below is not working.
source_path = "DataJune.txt"

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=1000000)):
    dfs = dict(tuple(chunk.groupby('State')))
    for i, df in dfs.items():
        df = df.append(df)
        df.to_csv("tempcsv/" + i + ".csv", sep=",", index=False)
IIUC, try:

source_path = "DataJune.txt"
from collections import defaultdict

def def_value():
    return pd.DataFrame()

# Defining the dict
d = defaultdict(def_value)

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        # pd.concat instead of the removed DataFrame.append
        d[state] = pd.concat([d[state], chunk[chunk['State'] == state]])

for i, df in d.items():
    df.to_csv("tempcsv/" + str(i) + ".csv", sep=",", index=False)
Another version, based on the #Corralien comment:
source_path = "DataJune.txt"

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        with open("tempcsv/" + str(state) + ".csv", mode='a+') as file:
            for i, row in chunk[chunk['State'] == state].iterrows():
                file.write(','.join([str(x) for x in row]))
                file.write('\n')
Another version:
source_path = "DataJune.txt"
from os.path import exists
import csv

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        path = "tempcsv/" + str(state) + ".csv"
        file_exists = exists(path)
        if not file_exists:
            with open(path, newline='', mode='a+') as file:
                writer = csv.writer(file)
                writer.writerow(chunk.columns)
                print(chunk.columns)
        with open(path, newline='', mode='a+') as file:
            writer = csv.writer(file)
            writer.writerows(chunk[chunk['State'] == state].values)
You can use:
import pandas as pd
import os

source_path = 'DataJune.txt'
fps = {}

for chunk in pd.read_csv(source_path, sep='|', chunksize=1000000, dtype=object):
    for state, df in chunk.groupby('State'):
        # New state: create a new file and write headers
        if state not in fps:
            fps[state] = open(f'tempcsv/{state}.csv', 'w')
            fps[state].write(f"{','.join(df.columns)}{os.linesep}")
        # Write data without headers
        df.to_csv(fps[state], index=False, header=False)

# Close files properly
for fp in fps.values():
    fp.close()
del fps
Update
Try to replace:
# Write data without headers
df.to_csv(fps[state], index=False, header=False)
By
# Write data without headers
g = (row.strip() for row in df.to_csv(index=False, header=None, sep=',').split(os.linesep) if row)
print(*g, sep=os.linesep, file=fps[state])
I have code to merge a few Excel files together using Python, but I can't rename anything in the resulting dataframe using df.rename(). Could someone explain why? Thanks!
import os
import xlrd
import pandas as pd

def file_name(file_dir):
    list = []
    for file in os.listdir(file_dir):
        if os.path.splitext(file)[1] == '.xlsx':
            list.append(file)
    return list

path = r'E:\Sync\External\Test'
wks = file_name(path)
data = []

for i in range(len(wks)):
    read_xlsx = xlrd.open_workbook(path + '\\' + wks[i])
    sheet1 = read_xlsx.sheets()[1]
    nrow = sheet1.nrows
    title = sheet1.row_values(0)
    location = os.path.splitext(wks[i])[0]
    for j in range(6, nrow):
        a = sheet1.row_values(j)
        a.insert(0, location)
        print(a)
        data.append(a)

content = pd.DataFrame(data)
content.rename({'0': 'X', '1': 'Y'}, axis=1, inplace=True)
#content.to_csv(path + '\\test.xlsx', sep=',', header=True, index=False)
content.to_excel(path + '\\test.xlsx', header=True, index=False)
Code as above; no error shows, but the rename part just doesn't work.
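No answer was posted here, but a likely cause (an assumption on my part, inferred from the code above): pd.DataFrame(data) gives the frame integer column labels 0, 1, ..., while the rename mapping uses the strings '0' and '1'; rename silently ignores keys that match no column. A minimal sketch of the fix:

import pandas as pd

data = [['siteA', 1.0, 2.0], ['siteB', 3.0, 4.0]]  # hypothetical rows
content = pd.DataFrame(data)

# The columns here are the integers 0, 1, 2, so rename with int keys:
content.rename({0: 'X', 1: 'Y'}, axis=1, inplace=True)
print(content.columns.tolist())  # ['X', 'Y', 2]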
I'm new to pandas/Python and I've come up with the following code to extract data from a specific part of a worksheet.
import openpyxl as xl
import pandas as pd

rows_with_data = [34, 37, 38, 39, 44, 45, 46, 47, 48, 49, 50, 54, 55, 57, 58, 59, 60,
                  62, 63, 64, 65, 66, 70, 71, 72, 76, 77, 78, 79, 80, 81, 82, 83, 84,
                  88, 89, 90, 91, 92]

path = r'XXX'
xpath = input('XXX')
file = r'**.xlsm'
xfile = input('Change file name, current is ' + file + ' :')
sheetname = r'Summary'

wb = xl.load_workbook(filename=xpath + '\\' + file, data_only=True)
sheet = wb.get_sheet_by_name(sheetname)
rows = len(rows_with_data)

line_items = []
for i in range(rows):
    line_items.append(sheet.cell(row=rows_with_data[i], column=13).value)

period = []
for col in range(17, 35):
    period.append(sheet.cell(row=20, column=col).value)
print(line_items)

vals = []
x = []
for i in range(rows):
    if i != 0:
        vals.append(x)
        x = []
    for col in range(17, 35):
        x.append(sheet.cell(row=rows_with_data[i], column=col).value)
vals.append(x)

all_values = {}
all_values['Period'] = period
for i in range(rows):
    print(line_items[i])
    all_values[line_items[i]] = vals[i]
print(all_values)

period_review = input('Enter a period (i.e. 2002): ')
item = input('Enter a period (i.e. XXX): ')
time = period.index(period_review)
display_item = str(all_values[item][time])
print(item + ' for ' + period_review + " is " + display_item)

Summary_Dataframe = pd.DataFrame(all_values)
writer = pd.ExcelWriter(xpath + '\\' + 'values.xlsx')
Summary_Dataframe.to_excel(writer, 'Sheet1')
writer.save()
writer.close()
I have the same worksheet (summary results) across a library of 60 xlsm files, and I'm having a hard time figuring out how to iterate this across the entire folder of files. I also want to change this from extracting specific rows to taking the entire "Summary" worksheet, pasting it into the new file, and naming each worksheet after its source file (e.g. "Experiment_A"). Any advice?
I was having a hard time reading your code to understand what you ultimately want to do, so this is just advice, not a solution. You can iterate through all the files in the folder using os, read them into one dataframe, and then save the single big dataframe to CSV. I usually avoid Excel, but I guess you need the Excel conversion. In the example below I read all the txt files from a directory, put them into a list of dataframes, and then store the combined dataframe as JSON. You can also store it as Excel/CSV.
import os
import pandas as pd

def process_data():
    # input file path in 2 parts in case it is very long
    input_path_1 = r'\\path\to\the\folder'
    input_path_2 = r'\second\part\of\the\path'
    # assembling the full file path
    file_path = input_path_1 + input_path_2
    # listing all files in the folder
    file_list = os.listdir(os.path.join(file_path))
    # selecting only the .txt files into a list object
    file_list = [file_name for file_name in file_list if '.txt' in file_name]
    # selecting the fields we need ('sent_date' added since it is used below)
    field_names = ['country', 'ticket_id', 'sent_date']
    # defining a list to collect all the dataframes
    pd_list = []
    inserted_files = []
    # looping over txt files and storing them in the list
    for file_name in file_list:
        # creating the file path to read the file
        file_path_ = file_path + '\\' + file_name
        df_ = pd.read_csv(os.path.join(file_path_), sep='\t', usecols=field_names)
        # converting the datetime to date:
        # a few internal data transformations before writing
        df_['sent_date'] = pd.to_datetime(df_['sent_date'])
        df_['sent_date'] = df_['sent_date'].values.astype('datetime64[M]')
        # adding each dataframe to the list
        pd_list.append(df_)
        # adding the file name to the inserted list to print later
        inserted_files.append(file_name)
    print(inserted_files)
    # sql-like union all dataframes to create a single data source
    df_ = pd.concat(pd_list)
    output_path_1 = r'\\path\to\output'
    output_path_2 = r'\path\to\output'
    output_path = output_path_1 + output_path_2
    # put the file name
    file_name = 'xyz.json'
    # adding the day the file was processed
    df_['etl_run_time'] = pd.to_datetime('today').strftime('%Y-%m-%d')
    # write file to json
    df_.to_json(os.path.join(output_path, file_name), orient='records')
    return print('Data Stored as json successfully')

process_data()
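For the specific follow-up in the question (copying the whole "Summary" sheet from each workbook and naming the output sheet after the file), here is a minimal sketch under my assumptions: all 60 .xlsm files sit in one folder, each has a sheet literally named 'Summary', and the folder and output names are illustrative:

import glob
import os
import pandas as pd

folder = r'path_to_xlsm_folder'  # hypothetical folder holding the 60 .xlsm files

with pd.ExcelWriter(os.path.join(folder, 'combined.xlsx')) as writer:
    for path in glob.glob(os.path.join(folder, '*.xlsm')):
        # Read the entire Summary worksheet from each workbook
        df = pd.read_excel(path, sheet_name='Summary')
        # Name the output sheet after the source file, e.g. 'Experiment_A';
        # Excel caps sheet names at 31 characters
        sheet = os.path.splitext(os.path.basename(path))[0][:31]
        df.to_excel(writer, sheet_name=sheet, index=False)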