How to merge Excel files without losing their format - python

So here is a sample of my excel layout:
But after merging it has two headers and loses the layout.
Here is my code:
import pandas as pd
import glob

path = r"C:/Users//"
fname = glob.glob(path + "/*.xlsx")

result_DFs1 = pd.DataFrame()
result_DFs2 = pd.DataFrame()

for i in fname:
    try:
        df1 = pd.read_excel(i, sheet_name="Test1")
        result_DFs1 = pd.concat([result_DFs1, df1])
    except:
        pass

for i in fname:
    try:
        df2 = pd.read_excel(i, sheet_name="Test2")
        result_DFs2 = pd.concat([result_DFs2, df2])
    except:
        pass

with pd.ExcelWriter('pandas_to_excel.xlsx') as writer:
    result_DFs1.to_excel(writer, sheet_name='Test1')
    result_DFs2.to_excel(writer, sheet_name='Test2')
Is there a way I can have just one header without losing the Excel layout format?

You can keep track of which file you are on and only include the header for the first one. Something like:
first = True
for i in fname:
    try:
        if first:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=0, header=0)
            first = False
        else:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=1, header=None)
        result_DFs1 = pd.concat([result_DFs1, df1])
    except:
        pass
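A small extension of that idea, reusing the first file's column names so the concatenated frames line up by name (a sketch for one sheet; the same pattern would be repeated for Test2, and note that pandas carries over only the data, not the cell formatting):
result_DFs1 = pd.DataFrame()
first = True
for i in fname:
    try:
        if first:
            # first file: keep its header row and remember the column names
            df1 = pd.read_excel(i, sheet_name="Test1", header=0)
            cols = df1.columns
            first = False
        else:
            # later files: skip their header row and reuse the first file's names
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=1, header=None, names=cols)
        result_DFs1 = pd.concat([result_DFs1, df1], ignore_index=True)
    except Exception:
        pass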

Related

How to resolve DtypeWarning: Columns (23) have mixed types. Specify dtype option on import or set low_memory=False?

I'm getting this warning in my script: "DtypeWarning: Columns (23) have mixed types. Specify dtype option on import or set low_memory=False". The file is still being created, but I feel it needs to be corrected. I have tried inserting this line in my code, but it doesn't work:
new_df = pd.read_csv('partial.csv', low_memory=False)
What and where should I add in my code?
import os
import pandas as pd
import tkinter
from tkinter import messagebox

root = tkinter.Tk()
root.withdraw()

directory = 'C:/path'
ext = ('.csv')

for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    if f.endswith(ext):
        head_tail = os.path.split(f)
        head_tail1 = 'C:/Output'
        k = head_tail[1]
        r = k.split(".")[0]
        p = head_tail1 + "/" + r + " - .csv"

        mydata = pd.read_csv(f)

        # to pull columns and values
        new = mydata[["A", "B", "C", "D"]]
        new = new.rename(columns={'D': 'F'})
        new['F'] = 1
        print(new.columns)
        new["B"] = (pd.to_datetime(new["B"], format="%d-%b", errors="coerce").dt.strftime("%#m-%#d").fillna(new["B"]))
        new.to_csv(p, index=False)

        # to merge columns and values
        merge_columns = ['A', 'B', 'C']
        merged_col = ''.join(merge_columns).replace('ABC', 'G')
        new[merged_col] = new[merge_columns].astype(str).apply(lambda x: '.'.join(x), axis=1)
        new.drop(merge_columns, axis=1, inplace=True)
        new = new.groupby(merged_col).count().reset_index()
        new.to_csv(p, index=False)

messagebox.showinfo("Parts has been counted", "New csv file can be found in C:/Path")
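For illustration only, a minimal sketch of where the options named in the warning would go; it is presumably the pd.read_csv(f) call inside the loop that triggers it, not a separate read (the dtype mapping below is a hypothetical example):
# let pandas read the whole file before inferring dtypes (no chunked type guessing) ...
mydata = pd.read_csv(f, low_memory=False)
# ... or pin the offending column's type explicitly (column name is hypothetical)
mydata = pd.read_csv(f, dtype={"C": str})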

Split large DataFrame into Dataframes containing records of unique values in a column

A csv file has 90 million rows. One of the columns is named "State". It currently has 12 unique values. (The count of unique values in the "State" column is dynamic and can change with each csv file.)
I want to split the DataFrame into smaller chunks and then save State-wise files.
The code below is not working.
source_path = "DataJune.txt"

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=1000000)):
    dfs = dict(tuple(chunk.groupby('State')))
    for i, df in dfs.items():
        df = df.append(df)
        df.to_csv("tempcsv/" + i + ".csv", sep=",", index=False)
IIUC (if I understand correctly), try:
source_path = "DataJune.txt"
from collections import defaultdict

def def_value():
    return pd.DataFrame()

# Defining the dict
d = defaultdict(def_value)

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        d[state] = d[state].append(chunk[chunk['State'] == state])

for i, df in d.items():
    df.to_csv("tempcsv/" + str(i) + ".csv", sep=",", index=False)
Another version, based on @Corralien's comment:
source_path = "DataJune.txt"

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        with open("tempcsv/" + str(state) + ".csv", mode='a+') as file:
            for i, row in chunk[chunk['State'] == state].iterrows():
                file.write(','.join([str(x) for x in row]))
                file.write('\n')
Another version:
source_path = "DataJune.txt"
from os.path import exists
import csv

for i, chunk in enumerate(pd.read_csv(source_path, sep='|', chunksize=2)):
    chunk_states = chunk['State'].unique()
    for state in chunk_states:
        path = "tempcsv/" + str(state) + ".csv"
        file_exists = exists(path)
        if not file_exists:
            with open(path, newline='', mode='a+') as file:
                writer = csv.writer(file)
                writer.writerow(chunk.columns)
                print(chunk.columns)
        with open(path, newline='', mode='a+') as file:
            writer = csv.writer(file)
            writer.writerows(chunk[chunk['State'] == state].values)
You can use:
import pandas as pd
import os

source_path = 'DataJune.txt'

fps = {}
for chunk in pd.read_csv(source_path, sep='|', chunksize=1000000, dtype=object):
    for state, df in chunk.groupby('State'):
        # New state, create a new file and write headers
        if state not in fps:
            fps[state] = open(f'tempcsv/{state}.csv', 'w')
            fps[state].write(f"{','.join(df.columns)}{os.linesep}")
        # Write data without headers
        df.to_csv(fps[state], index=False, header=False)

# Close files properly
for fp in fps.values():
    fp.close()
del fps
Update
Try to replace:
# Write data without headers
df.to_csv(fps[state], index=False, header=False)
By
# Write data without headers
g = (row.strip() for row in df.to_csv(index=False, header=None, sep=',').split(os.linesep) if row)
print(*g, sep=os.linesep, file=fps[state])

How to add name of csv files as values in a column while merging 1000+ files?

I am trying to merge 1000+ csv files using the following code:
import glob
import shutil

path = r'path_to_files/'
all_files = glob.glob(path + "/*.csv")

with open('updated_thirteen_jan.csv', 'wb') as wfd:
    for f in all_files:
        with open(f, 'rb') as fd:
            shutil.copyfileobj(fd, wfd)
I am using the above code to avoid RAM crashes, and it works fine. However, I would also like to do what the following code does:
path = r'path_to_files/'
all_files = glob.glob(path + "/*.csv")

fields = ['col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7', 'col8']
li = []
first_one = True

for filename in all_files:
    if not first_one:
        # if it is not the first csv file, skip the header row (row 0) of that file
        skip_row = [0]
    else:
        skip_row = []
        first_one = False
    df = pd.read_csv(filename, index_col=None, skiprows=skip_row, engine='python', usecols=fields)
    df = df[(df['lang'] == 'en')]
    filename = os.path.basename(filename)
    df['file_name'] = filename
    li.append(df)

frame = pd.concat(li, axis=0, ignore_index=True)
From this code, I would like to keep the column selection (fields), the header-row skipping, and the file_name column added as a value.
Any guidance please?
If memory is the constraint, then one pandas-based solution is to iterate over chunks of rows:
import os
import pandas as pd

print(pd.__version__)
# works with this version: '1.3.4'

# gen sample files
all_files = [f"{_}.csv" for _ in range(3)]
for filename in all_files:
    df = pd.DataFrame(range(3))
    df.to_csv(filename, index=False)

# combine into one
mode = "w"
header = True
for filename in all_files:
    with pd.read_csv(
        filename,
        engine="python",
        iterator=True,
        chunksize=10_000,
    ) as reader:
        for df in reader:
            filename = os.path.basename(filename)
            df["file_name"] = filename
            df.to_csv("some_file.csv", index=False, mode=mode, header=header)
            mode = "a"
            header = False
The good old csv module can process one row at a time, so memory will not be an issue. The following code will concatenate the csv files keeping only the first header, and adding a filename column populated with the filename.
import csv

path = r'path_to_files/'
all_files = glob.glob(path + "/*.csv")

with open('updated_thirteen_jan.csv', 'w', newline='') as wfd:
    wr = csv.writer(wfd)
    first = True
    for f in all_files:
        with open(f) as fd:
            rd = csv.reader(fd)
            # skip header line, except for the first file
            row = next(rd)
            if first:
                row.append('filename')
                wr.writerow(row)
                first = False
            for row in rd:
                row.append(f)
                wr.writerow(row)
Another solution is to use dask:
# pip install dask
import dask.dataframe as dd
# dd.read_csv is mostly compatible with pd.read_csv options,
# so you can specify reading specific columns, etc.
ddf = dd.read_csv("some_path/*.csv")
ddf.to_csv('merged_file.csv', index=False, single_file=True)
Read one file at a time into a pandas DataFrame, add the new column to it, and append it to a new file.
import os
import glob
import pathlib

path = 'path_to_files/'
out_file = 'updated_thirteen_jan.csv'

all_files = glob.glob(path + '*.csv')
all_files = sorted([pathlib.Path(i) for i in all_files])

keep_cols = ['list', 'of', 'columns', 'to', 'keep']
skip_row = 2  # number of rows to skip

for fn in all_files:
    temp = pd.read_csv(fn, usecols=keep_cols, skiprows=skip_row)
    temp['filename'] = fn.stem
    temp.to_csv(out_file, mode='a', index=False, header=not os.path.isfile(out_file))
If reading the entire csv into memory isn't feasible, then use chunksize. Adjust this value to your machine's capacity.
for fn in all_files:
    reader = pd.read_csv(fn, usecols=keep_cols, skiprows=skip_row, chunksize=5000)
    for idx, df in enumerate(reader):
        df['filename'] = fn.stem
        df.to_csv(out_file, mode='a', index=False, header=not os.path.isfile(out_file))

Change Column Name in dataframe and melt it

I have code to merge a few Excel files together using Python, but I can't rename anything in that DataFrame using df.rename(). Could someone explain why? Thanks!
import os
import xlrd
import pandas as pd

def file_name(file_dir):
    list = []
    for file in os.listdir(file_dir):
        if os.path.splitext(file)[1] == '.xlsx':
            list.append(file)
    return list

path = r'E:\Sync\External\Test'
wks = file_name(path)
data = []

for i in range(len(wks)):
    read_xlsx = xlrd.open_workbook(path + '\\' + wks[i])
    sheet1 = read_xlsx.sheets()[1]
    nrow = sheet1.nrows
    title = sheet1.row_values(0)
    location = os.path.splitext(wks[i])[0]
    for j in range(6, nrow):
        a = sheet1.row_values(j)
        a.insert(0, location)
        print(a)
        data.append(a)

content = pd.DataFrame(data)
content.rename({'0': 'X', '1': 'Y'}, axis=1, inplace=True)
# content.to_csv(path + '\\test.xlsx', sep=',', header=True, index=False)
content.to_excel(path + '\\test.xlsx', header=True, index=False)
Code as above; no error shows, but the rename part just doesn't work.
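One likely cause, for what it's worth: pd.DataFrame(data) built from a list of rows gets integer column labels (0, 1, 2, ...), so rename({'0': 'X', '1': 'Y'}, ...) passes string keys that match nothing and silently changes nothing. A minimal sketch of the difference (the sample rows are made up):
import pandas as pd

data = [["LocA", 1, 2], ["LocB", 3, 4]]
content = pd.DataFrame(data)
print(content.columns.tolist())                             # [0, 1, 2] -> integers, not strings

content.rename({'0': 'X', '1': 'Y'}, axis=1, inplace=True)  # string keys: no match, no change
content.rename({0: 'X', 1: 'Y'}, axis=1, inplace=True)      # integer keys: renames as expected
print(content.columns.tolist())                             # ['X', 'Y', 2]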

I want to write looping dataframes to Excel

I am new to Python. This task mainly reads the Excel files in a directory and filters the data in them, then writes the filtered data to Excel. When I try to write to Excel, only the last iteration's values are stored. Please advise how to write all of the data to Excel. I want to write df_filter and df_filter1, which are built inside a loop, to Excel.
import os
import xlrd
import pandas as pd
import xlwt
from openpyxl import load_workbook
import xlsxwriter
from pyexcelerate import Workbook
import numpy as np
from pandas import ExcelWriter
from tempfile import TemporaryFile

ALL_SHEETS = []
sheet_list = ""
file_path = os.path.join(input("enter Dir path"))
config_path = os.path.join(input("enter your config file path here"))
output_path = os.path.join(input("Dude where you want store outputfile"))
output1 = pd.ExcelWriter(output_path, engine='xlsxwriter')
ALL_SHEETS = [os.path.join(file_path, f) for f in os.listdir(file_path)
              if os.path.isfile(os.path.join(file_path, f))
              and f.endswith('.xlsx')]
i = 0
data1 = []
data = []
Packet_size = []
Trail_numbers = []
Though_put = []
Latency = []
Jitter = []
df_filter = pd.DataFrame(columns=['packetsize', 'throughput', 'latency (us)', 'jitter (us)'])
df_filter1 = pd.DataFrame(columns=['packetsize', 'throughput', 'latency (us)', 'jitter (us)'])
# df_sheet = pd.DataFrame(columns=['zsheet'])
merged_inner = pd.DataFrame([])

def sheets(val):
    s = wb.worksheets[val]
    df_sheet = pd.DataFrame(data=['%s' % str(s) + '\n'])
    # Name_sheet(s)
    HeaderList = pd.read_csv(config_path)
    column_list = []
    for col in HeaderList:
        col = col.lstrip("'")
        col = col.rstrip("'")
        column_list.append(col)
    df1 = xl.parse(sheet_list[val], skiprows=i)
    df1 = df1.filter(column_list)
    df2 = df1[(df1['Result'] != 'Failed') & (df1['Frame Size Type'] == 'iMIX')]
    if df2.empty:
        pass
    else:
        final3 = df2.groupby(['Trial Number', 'iMIX Distribution'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        # df_filter['sheetaname'] = df_sheet(lambda a: '%s' % a['sheetvise'], axis=1)
        final = final3.groupby(['iMIX Distribution'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        df_filter['packetsize'] = final.apply(lambda z: '%s' % (z['iMIX Distribution']), axis=1)
        df_filter['throughput'] = final.apply(lambda z: '%s' % (z['Throughput (%)']), axis=1)
        df_filter['latency (us)'] = final.apply(lambda x: '%s/%s/%s' % (x['Minimum Latency (us)'], x['Maximum Latency (us)'], x['Average Latency (us)']), axis=1)
        df_filter['jitter (us)'] = final.apply(lambda y: '%s/%s/%s' % (y['Minimum Jitter (us)'], y['Maximum Jitter (us)'], y['Average Jitter (us)']), axis=1)
        df_filter.to_excel(output1, sheet_name='mani')
        output1.save()
        df_filter.to_excel(output1, startrow=len(df_filter1) + len(df_filter) + 2, sheet_name='mani')
        output1.save()
    df3 = df1[(df1['Result'] != 'Failed') & (df1['Frame Size Type'] == 'Fixed')]
    if df3.empty:
        pass
    else:
        final2 = df3.groupby(['Trial Number', 'Configured Frame Size'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        final1 = final2.groupby(['Configured Frame Size'], sort=False).apply(lambda x: x.loc[x['Throughput (%)'].idxmax()])
        df_filter1['packetsize'] = final1.apply(lambda z: '%s' % (z['Configured Frame Size']), axis=1)
        df_filter1['throughput'] = final1.apply(lambda z: '%s' % (z['Throughput (%)']), axis=1)
        df_filter1['latency (us)'] = final1.apply(lambda x: '%s/%s/%s' % (x['Minimum Latency (us)'], x['Maximum Latency (us)'], x['Average Latency (us)']), axis=1)
        df_filter1['jitter (us)'] = final1.apply(lambda y: '%s/%s/%s' % (y['Minimum Jitter (us)'], y['Maximum Jitter (us)'], y['Average Jitter (us)']), axis=1)
        df_filter1.to_excel(output1, sheet_name='mani')
        df_filter1.to_excel(output1, startrow=len(df_filter1) + len(df_filter) + 2, sheet_name='mani')
        output1.save()

def sheet_every():
    for sheet in range(0, sheet_list_lenght):
        sheets(sheet)

for file in (ALL_SHEETS):
    df_file = pd.DataFrame(data=[file])
    workbook = xlrd.open_workbook(file)
    wb = load_workbook(file)
    xl = pd.ExcelFile(file)
    i = 0
    sheet_list = workbook.sheet_names()
    sheet_list_lenght = (len(sheet_list))
    for sheet in sheet_list:
        worksheet = workbook.sheet_by_name(sheet)
        for i in range(0, worksheet.nrows):
            row = worksheet.row_values(i)
            if 'Trial Number' in row:
                break
    sheet_every()
Not sure if this answers your question or not, but if you want to read from a dataframe and add rows to a new dataframe through a loop, you can refer to the code below:
dummyData = pd.read_csv("someexcelfile.csv")
# You can merge multiple dataframes into dummyData and make it a big dataframe

dummyInsertTable = pd.DataFrame(columns=["Col1", "Col2", "Col3"])
for i in range(len(dummyData)):
    dummyInsertTable.loc[i, "Col1"] = dummyData["Col1"][i]
    dummyInsertTable.loc[i, "Col2"] = dummyData["Col2"][i]
    dummyInsertTable.loc[i, "Col3"] = dummyData["Col3"][i]

dummyInsertTable.to_csv("writeCSVFile.csv")
And next time, be precise about where you are facing the problem.
EDIT
Try loading the first dataframe, then loop through the other files and append them to the first dataframe. Refer to the code:
import pandas as pd

# Make a list of all the files you have
filesList = ["/home/bhushan/firstFile.csv", "/home/bhushan/secondFile.csv", "/home/bhushan/thirdFile.csv", "/home/bhushan/fourthFile.csv"]

# Read the first csv file using pandas.read_csv
firstFile = pd.read_csv(filesList[0])

# Loop through the rest of the files and append them to the first DataFrame
for i in range(1, len(filesList)):
    fileToBeAdded = pd.read_csv(filesList[i])
    firstFile = firstFile.append(fileToBeAdded)

# Write the final file
finalFile = firstFile
finalFile.to_csv("finalFile.csv")
If I get your question correctly, you have two data frames which you want to write to one excel file but you are only getting the last one.
You should write them to two different sheets instead, then you can retrieve them as per requirement, either individually or combined.
Follow the links below for more details and implementation:
https://xlsxwriter.readthedocs.io/example_pandas_multiple.html
https://campus.datacamp.com/courses/importing-managing-financial-data-in-python/importing-stock-listing-data-from-excel?ex=11
Alternatively, you can write to a csv file, which is also Excel compatible and easier to handle. I have also observed that it is faster and more space efficient than writing to an .xlsx file.
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html
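For illustration, a minimal sketch of the two-sheet approach with pandas' ExcelWriter (the sheet names and sample data below are made up):
import pandas as pd

# stand-ins for df_filter and df_filter1 from the question
df_filter = pd.DataFrame({"packetsize": ["iMIX"], "throughput": [99.2]})
df_filter1 = pd.DataFrame({"packetsize": [64], "throughput": [87.5]})

# one workbook, one sheet per dataframe
with pd.ExcelWriter("output.xlsx", engine="xlsxwriter") as writer:
    df_filter.to_excel(writer, sheet_name="imix", index=False)
    df_filter1.to_excel(writer, sheet_name="fixed", index=False)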
