Need some help on python insert to csv.
Would like insert info and data frame to to csv.
After insert info with writerow, when insert data frame into csv, there was missing some header from data frame.
Correct header without : writer.writerow(info)
Wrong data frame header with : writer.writerow(info)
Missing 'No' to 'Billno' from data frame.
df = pd.read_sql(query, cnxn)
info = ['Date From:','',fromdate,'','To',todate]
with open('C:/my_csv/'+reportname+'.csv', 'w', newline='') as file:
writer = csv.writer(file)
writer.writerow(info)
folder_path = r'C:/my_csv'
file_type = r'/*csv'
files = glob.glob(folder_path + file_type)
max_file = max(files, key=os.path.getctime)
df.to_csv(max_file, index=True, index_label="No", header=True)
Using this answer to a similar question as a template you could try something like:
import pandas as pd
data = {"Index" : [0,1], "A": [1, 1], "B": [2, 2], "C": [3,3]}
df = pd.DataFrame(data)
df.set_index("Index", inplace=True)
date_a = 19022023
date_b = 20022023
f = open('foo', 'a')
f.write(f"Info_1, {date_a}, Info_2, {date_b}\n")
df.to_csv(f, sep=",", header=True)
f.close()
>>> more foo
Info_1, 19022023, Info_2, 20022023
Index,A,B,C
0,1,2,3
1,1,2,3
Related
So here is a sample of my excel layout:
But after merging it has two header and loses the layout.
Here is my code:
import pandas as pd
import glob
path = r"C:/Users//"
fname = glob.glob(path + "/*.xlsx")
result_DFs1 = pd.DataFrame()
result_DFs2 = pd.DataFrame()
for i in fname:
try:
df1 = pd.read_excel(i,sheet_name = "Test1")
result_DFs1 = pd.concat([result_DFs1, df1])
except:
pass
for i in fname:
try:
df2 = pd.read_excel(i,sheet_name = "Test2")
result_DFs2 = pd.concat([result_DFs2, df2])
except:
pass
with pd.ExcelWriter('pandas_to_excel.xlsx') as writer:
result_DFs1.to_excel (writer, sheet_name='Test1')
result_DFs2.to_excel (writer, sheet_name='Test2')
Is there a way I can just have one header and without losing the excel layout format?
You can keep track of your sheets and only include headers for the first one. Something like:
first = True
for i in fname:
try:
if first:
df1 = pd.read_excel(i,sheet_name = "Test1", skiprows=0, header=0)
first = False
else:
df1 = pd.read_excel(i,sheet_name = "Test1", skiprows=1, header=None)
result_DFs1 = pd.concat([result_DFs1, df1])
except:
pass
A csv file has 90 million rows. One of the Columns in named "State". It has 12 unique values at present. (The count of unique values in the "State" column is dynamic and can change with each csv file.)
I want to split the DataFrame into smaller chunks and then save State-wise files.
The code below is not working.
source_path = "DataJune.txt"
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=1000000)):
dfs = dict(tuple(chunk.groupby('State')))
for i, df in dfs.items():
df = df.append(df)
df.to_csv("tempcsv/" + i +".csv",sep=",", index = False)
IIUC, Try:
source_path = "DataJune.txt"
from collections import defaultdict
def def_value():
return pd.DataFrame()
# Defining the dict
d = defaultdict(def_value)
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=2)):
chunk_states = chunk['State'].unique()
for state in chunk_states:
d[state]=d[state].append(chunk[chunk['State']==state])
for i, df in d.items():
df.to_csv("tempcsv/" + str(i) +".csv",sep=",", index = False)
Another version, based on the #Corralien comment:
source_path = "DataJune.txt"
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=2)):
chunk_states = chunk['State'].unique()
for state in chunk_states:
with open("tempcsv/" + str(state) +".csv",mode='a+') as file:
for i, row in chunk[chunk['State']==state].iterrows():
file.write(','.join([str(x) for x in row]))
file.write('\n')
Another version:
source_path = "DataJune.txt"
from os.path import exists
import csv
for i,chunk in enumerate(pd.read_csv(source_path, sep = '|',chunksize=2)):
chunk_states = chunk['State'].unique()
for state in chunk_states:
path = "tempcsv/" + str(state) +".csv"
file_exists = exists(path)
if not file_exists:
with open(path,newline='',mode='a+') as file:
writer = csv.writer(file)
writer.writerow(chunk.columns)
print(chunk.columns)
with open(path,newline='',mode='a+') as file:
writer = csv.writer(file)
writer.writerows(chunk[chunk['State']==state].values)
You can use:
import pandas as pd
import os
source_path = 'DataJune.txt'
fps = {}
for chunk in pd.read_csv(source_path, sep='|', chunksize=1000000, dtype=object):
for state, df in chunk.groupby('State'):
# New state, create a new file and write headers
if state not in fps:
fps[state] = open(f'tempcsv/{state}.csv', 'w')
fps[state].write(f"{','.join(df.columns)}{os.linesep}")
# Write data without headers
df.to_csv(fps[state], index=False, header=False)
# Close files properly
for fp in fps.values():
fp.close()
del fps
Update
Try to replace:
# Write data without headers
df.to_csv(fps[state], index=False, header=False)
By
# Write data without headers
g = (row.strip() for row in df.to_csv(index=False, header=None, sep=',').split(os.linesep) if row)
print(*g, sep=os.linesep, file=fps[state])
I am have a code to merge few excel together using Python, but i cant realy rename any thing in that dataframe using df.rename(). could someone explain why? Thanks!
import os
import xlrd
import pandas as pd
def file_name(file_dir):
list=[]
for file in os.listdir(file_dir):
if os.path.splitext(file)[1] == '.xlsx':
list.append(file)
return list
path = r'E:\Sync\External\Test'
wks = file_name(path)
data = []
for i in range(len(wks)):
read_xlsx = xlrd.open_workbook(path + '\\' + wks[i])
sheet1 = read_xlsx.sheets()[1]
nrow = sheet1.nrows
title = sheet1.row_values(0)
location = os.path.splitext(wks[i])[0]
for j in range(6,nrow):
a = sheet1.row_values(j)
a.insert(0,location)
print(a)
data.append(a)
content= pd.DataFrame(data)
content.rename({'0': 'X', '1': 'Y'}, axis=1, inplace=True)
#content.to_csv(path+'\\test.xlsx', sep=',', header=True, index=False)
content.to_excel(path+'\\test.xlsx', header=True, index=False)
Code as above, no error shows,but it's just doesn't work (rename part)
I would like to edit multiple worksheets present in the same Excel File and then save them with the adjustments made. These worksheets have the same columns headers and are called Credit and Debit. The code that I have created is the following:
import pandas as pd
import numpy as np
class blah:
def __init__(self, path, file_in, file_out):
self.path = path
self.file_inviato = file_in
self.file_out = file_out
def process_file(self):
df = pd.read_excel(self.path + self.file_in, sheet_name=None, skiprows=4)
****Here is where I am struggling in amending both worksheets at the same time****
# df = df.columns.str.strip()
# df['Col1'] = np.where((df['Col2'] == 'KO') | (df['Col2'] == 'OK'), 0, df['Col1'])
writer = pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter')
for sheet_name in df.keys():
df[sheet_name].to_excel(writer, sheet_name=sheet_name, index=False)
writer.save()
b = blah('path....',
'file in....xlsx',
'file out.xlsx')
b.process_file()
found a workaround:
for sheet_name in df.keys():
df[sheet_name] = df[sheet_name].rename(columns=lambda x: x.strip())
df[sheet_name]['Col1'] = np.where((df[sheet_name]['Col2'] == 'KO') |
(df[sheet_name]['Col2'] == 'OK'), 0, df[sheet_name]['Col1'])
Hi I have multiple xlsx files
sales-feb-2014.xlsx
sales-jan-2014.xlsx
sales-mar-2014.xlsx
I have merged all 3 sheets into one data set using file name as INDEX[0]
script :
import pandas as pd
import numpy as np
import glob
import os
all_data = pd.DataFrame()
for f in glob.glob(r'H:\Learning\files\sales*.xlsx'):
df = pd.read_excel(f)
df['filename'] = os.path.basename(f)
df = df.reset_index().set_index('filename')
print(df)
Now Data looks like this :
file name col1 col2 col3
sales-jan-2014.xlsx .... .... ...
sales-feb-2014.xlsx .... .... ...
sales-mar-2014.xlsx .... .... ...
here I want to load new xlsx file where I need to load
sales-jan-2014.xlsx into sheet1
sales-feb-2014.xlsx into sheet2
sales-mar-2014.xlsx into sheet3
I have tried with this script :
writer = pd.ExcelWriter('output.xlsx')
for filename in df.index.get_level_values(0).unique():
temp_df = df.xs(filename, level=0)
temp_df.to_excel(writer,filename)
writer.save()
after executing this script i'm getting error :
loc, new_ax = labels.get_loc_level(key, level=level,
AttributeError: 'Index' object has no attribute 'get_loc_level'
can you please suggest where I'm missing
Try using the below code :
import os
import pandas as pd
dirpath = "C:\\Users\\Path\\TO\\Your XLS folder\\data\\"
fileNames = os.listdir(dirpath)
writer = pd.ExcelWriter(dirpath+'combined.xlsx', engine='xlsxwriter')
for fname in fileNames:
df = pd.read_excel(dirpath+fname)
print(df)
df.to_excel(writer, sheet_name=fname)
writer.save()
You can also use your code by make below changes :
for f in glob.glob(r'H:\Learning\files\sales*.xlsx'):
df = pd.read_excel(f)
df['filename'] = os.path.basename(f)
df = df.reset_index()
print(df.columns)
df.set_index(['filename','index'], inplace=True)
and saving it as you have done.
I hope this helps