I have some code that merges several Excel files using Python, but I can't rename anything in the resulting DataFrame with df.rename(). Could someone explain why? Thanks!
import os
import xlrd
import pandas as pd
def file_name(file_dir):
    """Return the names of the ``.xlsx`` files located directly in *file_dir*.

    Args:
        file_dir: Path of the directory to scan (sub-directories are not
            searched).

    Returns:
        A list of bare file names (no directory part) whose extension is
        exactly ``.xlsx``, in the order ``os.listdir`` reports them.
    """
    # A comprehension replaces the append loop and, unlike the previous
    # local named ``list``, does not shadow the built-in.
    return [
        name
        for name in os.listdir(file_dir)
        if os.path.splitext(name)[1] == '.xlsx'
    ]
# Folder that holds the workbooks to merge; the merged result is written
# back into the same folder.
path = r'E:\Sync\External\Test'
output_name = 'test.xlsx'
wks = file_name(path)
data = []
for workbook_name in wks:
    # The output lives in the scanned folder, so skip it: otherwise a second
    # run would try to merge the previous result into itself.
    if workbook_name == output_name:
        continue
    # NOTE(review): xlrd 2.0+ reads only .xls files — confirm the installed
    # xlrd version still opens .xlsx.
    read_xlsx = xlrd.open_workbook(path + '\\' + workbook_name)
    # NOTE(review): index 1 is the *second* sheet of the workbook — confirm
    # that this is the intended sheet.
    sheet1 = read_xlsx.sheets()[1]
    nrow = sheet1.nrows
    title = sheet1.row_values(0)
    # The workbook name without its extension tags every row it contributes.
    location = os.path.splitext(workbook_name)[0]
    # Rows 0-5 are skipped; data rows are taken from row index 6 onwards.
    for j in range(6, nrow):
        a = sheet1.row_values(j)
        a.insert(0, location)
        print(a)
        data.append(a)
content = pd.DataFrame(data)
# A DataFrame built from a list of lists gets the *integer* column labels
# 0, 1, 2, ...  rename() silently ignores keys that match no label, so the
# string keys '0' and '1' changed nothing; the keys must be the ints 0 and 1.
content.rename(columns={0: 'X', 1: 'Y'}, inplace=True)
content.to_excel(path + '\\' + output_name, header=True, index=False)
The code is above. No error is raised, but the rename step simply has no effect.
Related
So here is a sample of my excel layout:
But after merging, the output has two header rows and loses the layout.
Here is my code:
import pandas as pd
import glob
path = r"C:/Users//"
fname = glob.glob(path + "/*.xlsx")


def _stack_sheet(workbooks, sheet_name):
    """Concatenate *sheet_name* from every workbook that can supply it.

    Args:
        workbooks: Iterable of workbook paths.
        sheet_name: Name of the worksheet to read from each workbook.

    Returns:
        One DataFrame with the rows of all readable sheets stacked in input
        order, or an empty DataFrame when nothing could be read.
    """
    frames = []
    for workbook in workbooks:
        try:
            frames.append(pd.read_excel(workbook, sheet_name=sheet_name))
        except Exception as exc:
            # Still best effort (a workbook without this sheet is left out),
            # but the skip is reported instead of being swallowed silently.
            print(f"Skipping {workbook!r} for sheet {sheet_name!r}: {exc}")
    # Concatenating once at the end avoids re-copying the accumulated rows
    # on every iteration; pd.concat([]) raises, hence the guard.
    return pd.concat(frames) if frames else pd.DataFrame()


result_DFs1 = _stack_sheet(fname, "Test1")
result_DFs2 = _stack_sheet(fname, "Test2")

# The with-block saves and closes the workbook when it exits.
with pd.ExcelWriter('pandas_to_excel.xlsx') as writer:
    result_DFs1.to_excel(writer, sheet_name='Test1')
    result_DFs2.to_excel(writer, sheet_name='Test2')
Is there a way to have just one header row without losing the Excel layout formatting?
You can keep track of your sheets and only include headers for the first one. Something like:
# Only the first readable workbook contributes the header row; every later
# workbook is read without its header line.
first = True
for i in fname:
    try:
        if first:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=0, header=0)
            first = False
        else:
            df1 = pd.read_excel(i, sheet_name="Test1", skiprows=1, header=None)
            # header=None labels the columns 0, 1, 2, ...; pd.concat aligns on
            # labels, so without this the rows would land in new, separate
            # columns instead of under the first workbook's headers.
            df1.columns = result_DFs1.columns
        result_DFs1 = pd.concat([result_DFs1, df1])
    except Exception as exc:
        # Best effort as before, but report what was skipped and why.
        print(f"Skipping {i!r}: {exc}")
I am trying to make changes to multiple Excel files. I have the program below to loop through the files. Strangely, I get this error: NameError: name 'df2' is not defined. Can someone help me understand what the issue is? Thank you in advance!
import os
import glob
import pandas as pd
from pathlib import Path
# Folder whose .xlsx workbooks are read and written back.
folder = (r"C:\Users\Documents\Extracted")
for file_name in Path(folder).glob('*.xlsx'):
df = pd.read_excel(file_name)
# Columns that are to be removed from each workbook.
drop_list = ['BarrierFreeAttributes.BarrierFreeAttribute','ConsultationHours.ConsultationHoursTimeSpan', 'Location.Coordinates.Latitude_right', 'Location.Coordinates.Longitude_right']
# NOTE(review): this tests that every column of df is listed in drop_list,
# not that every drop_list entry is a column of df — confirm which was meant.
if all(item in drop_list for item in list(df.columns)):
# df2 is assigned only on this branch.
df2 = df.drop(columns=drop_list, axis=1)
else:
print(file_name)
# NOTE(review): if the condition was false and no earlier iteration assigned
# df2, the name is unbound here — this is the reported NameError.
df2.to_excel(file_name.with_suffix('.xlsx'),index = False)
Data:
The reason is that df2 is never defined when the else branch runs. The simplest fix is to use df only and overwrite it in the if branch; the write to file is also moved out of the else branch:
# Columns to remove from every workbook; loop-invariant, so built once.
drop_list = ['BarrierFreeAttributes.BarrierFreeAttribute','ConsultationHours.ConsultationHoursTimeSpan', 'Location.Coordinates.Latitude_right', 'Location.Coordinates.Longitude_right']
for file_name in Path(folder).glob('*.xlsx'):
    df = pd.read_excel(file_name)
    # Drop only when every listed column is present.  The test used to be
    # inverted (every column of df had to appear in drop_list), which is
    # false for any workbook that has at least one other column.
    if all(item in df.columns for item in drop_list):
        df = df.drop(columns=drop_list)
    else:
        # Report workbooks that lack some of the columns; they are written
        # back unchanged below.
        print(file_name)
    df.to_excel(file_name.with_suffix('.xlsx'), index=False)
I would like to edit multiple worksheets present in the same Excel File and then save them with the adjustments made. These worksheets have the same columns headers and are called Credit and Debit. The code that I have created is the following:
import pandas as pd
import numpy as np
class blah:
    """Read every worksheet of an Excel workbook and write them to a new file.

    Attributes:
        path: Prefix that is concatenated in front of both file names.
        file_in: Name of the workbook to read.
        file_out: Name of the workbook to write.
    """

    def __init__(self, path, file_in, file_out):
        self.path = path
        # process_file() reads ``self.file_in``; the value used to be stored
        # only as ``file_inviato``, so that read raised AttributeError.
        self.file_in = file_in
        self.file_inviato = file_in  # old attribute name, kept for compatibility
        self.file_out = file_out

    def process_file(self):
        """Copy all worksheets of the input workbook into the output workbook."""
        # sheet_name=None makes read_excel return a dict that maps each sheet
        # name to its DataFrame; skiprows=4 skips the first four rows of
        # every sheet.
        df = pd.read_excel(self.path + self.file_in, sheet_name=None, skiprows=4)

        # **** Here is where I am struggling in amending both worksheets at the same time ****
        # df = df.columns.str.strip()
        # df['Col1'] = np.where((df['Col2'] == 'KO') | (df['Col2'] == 'OK'), 0, df['Col1'])

        # The with-block saves and closes the workbook on exit, replacing the
        # explicit writer.save() call.
        with pd.ExcelWriter(self.path + self.file_out, engine='xlsxwriter') as writer:
            for sheet_name, frame in df.items():
                frame.to_excel(writer, sheet_name=sheet_name, index=False)
# Placeholder arguments, in order: path prefix, input workbook, output workbook.
b = blah('path....', 'file in....xlsx', 'file out.xlsx')
b.process_file()
found a workaround:
# df maps each sheet name to its DataFrame; every sheet gets the same fix-up.
for sheet_name in df:
    # Strip surrounding whitespace from the column headers.
    frame = df[sheet_name].rename(columns=lambda x: x.strip())
    df[sheet_name] = frame
    # Col1 becomes 0 wherever Col2 is 'KO' or 'OK'; other rows keep their value.
    status = frame['Col2']
    is_ko_or_ok = (status == 'KO') | (status == 'OK')
    frame['Col1'] = np.where(is_ko_or_ok, 0, frame['Col1'])
Hi I have multiple xlsx files
sales-feb-2014.xlsx
sales-jan-2014.xlsx
sales-mar-2014.xlsx
I have merged all 3 sheets into one data set using file name as INDEX[0]
script :
import pandas as pd
import numpy as np
import glob
import os
# One frame per workbook, each indexed by the name of the file it came from.
frames = []
for f in glob.glob(r'H:\Learning\files\sales*.xlsx'):
    df = pd.read_excel(f)
    df['filename'] = os.path.basename(f)
    # reset_index() keeps the original row number as an 'index' column;
    # the file name then becomes the (single-level) index.
    df = df.reset_index().set_index('filename')
    print(df)
    frames.append(df)
# all_data used to stay empty because nothing was ever added to it; it now
# holds the rows of every workbook.  pd.concat([]) raises, hence the guard.
all_data = pd.concat(frames) if frames else pd.DataFrame()
Now Data looks like this :
file name col1 col2 col3
sales-jan-2014.xlsx .... .... ...
sales-feb-2014.xlsx .... .... ...
sales-mar-2014.xlsx .... .... ...
here I want to load new xlsx file where I need to load
sales-jan-2014.xlsx into sheet1
sales-feb-2014.xlsx into sheet2
sales-mar-2014.xlsx into sheet3
I have tried with this script :
# Writes one worksheet per distinct value of the first index level of df.
writer = pd.ExcelWriter('output.xlsx')
for filename in df.index.get_level_values(0).unique():
# NOTE(review): this is the call that fails with the AttributeError quoted
# below — presumably because df has a flat index ('filename' only) rather
# than a MultiIndex, which xs(..., level=0) needs; confirm.
temp_df = df.xs(filename, level=0)
# The second positional argument is used as the sheet name.
temp_df.to_excel(writer,filename)
writer.save()
After executing this script I get the following error:
loc, new_ax = labels.get_loc_level(key, level=level,
AttributeError: 'Index' object has no attribute 'get_loc_level'
Can you please suggest what I'm missing?
Try using the below code :
import os
import pandas as pd
dirpath = "C:\\Users\\Path\\TO\\Your XLS folder\\data\\"
output_name = 'combined.xlsx'
# Keep only Excel workbooks, and leave out the output file itself so that a
# second run does not try to merge its own previous result.
fileNames = [
    entry
    for entry in os.listdir(dirpath)
    if entry.lower().endswith(('.xls', '.xlsx')) and entry != output_name
]
# The with-block saves and closes the workbook on exit, replacing the
# explicit writer.save() call.
with pd.ExcelWriter(dirpath + output_name, engine='xlsxwriter') as writer:
    for fname in fileNames:
        df = pd.read_excel(dirpath + fname)
        print(df)
        # Excel limits worksheet names to 31 characters.
        df.to_excel(writer, sheet_name=fname[:31])
You can also keep your own code by making the changes below:
# Index each workbook's rows by (file name, original row number).
for workbook in glob.glob(r'H:\Learning\files\sales*.xlsx'):
    tagged = pd.read_excel(workbook)
    tagged['filename'] = os.path.basename(workbook)
    # reset_index() exposes the row number as an 'index' column.
    df = tagged.reset_index()
    print(df.columns)
    # Two index levels, so level-based selection such as xs(level=0) applies.
    df = df.set_index(['filename', 'index'])
and saving it as you have done.
I hope this helps
Can you tell me whether Python can write to the same Excel file, but to 2 different worksheets (tabs)?
Just for example, I want to fetch the titles of the 4 websites below and write them into the same file, title.xls, but into Sheet1 and Sheet2 respectively.
www.dailynews.com
www.dailynews.co.zw
www.gulf-daily-news.com
www.dailynews.gov.bw
I do them in 2 scripts, each for 2 websites:
from bs4 import BeautifulSoup
import urllib2
import xlwt
# NOTE(review): Python 2 code (urllib2, print statement).
# Sites whose title text is written to the workbook, one per row.
line_in_list = ['www.dailynews.com','www.dailynews.co.zw']
# line_in_list = [www.gulf-daily-news.com','www.dailynews.gov.bw']
# A new, empty workbook object is created on every run of the script.
book = xlwt.Workbook(encoding='utf-8', style_compression = 0)
sheet = book.add_sheet('Sheet1', cell_overwrite_ok = True)
# sheet = book.add_sheet('Sheet2', cell_overwrite_ok = True)
for cor,websites in enumerate(line_in_list):
url = "http://" + websites
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read())
# All title elements of the page; only the first one is used below.
site_title = soup.find_all("title")
print site_title
# Row = position of the site in line_in_list, column 0.
sheet.write (cor, 0, site_title[0].text)
# NOTE(review): this workbook contains only the single sheet added above, so
# saving it presumably replaces whatever title.xls held before — which would
# explain why running the script once per sheet never yields both sheets.
book.save("title.xls")
However, the script overwrites the sheets: I can only have either Sheet1 or Sheet2, but never both.
Any help? Thanks.
You can also do it using pandas.
import pandas as pd
# One DataFrame per worksheet; the dictionary key becomes the column name.
df1 = pd.DataFrame({'website': ['www.dailynews.com', 'www.dailynews.co.zw']})
df2 = pd.DataFrame({'website': ['www.gulf-daily-news.com', 'www.dailynews.gov.bw']})
# Open a new workbook; leaving the with-block closes it, which saves the file.
with pd.ExcelWriter('title.xlsx', engine='xlsxwriter') as writer:
    # Each DataFrame goes to its own worksheet of the same workbook.
    for sheet_name, frame in (('Sheet1', df1), ('Sheet2', df2)):
        frame.to_excel(writer, sheet_name=sheet_name)
If I understood correctly what you need, the following should work. (Sorry, I can't post a comment to ask for clarification first.)
# Add both worksheets to the same workbook object before anything is saved.
sheet1, sheet2 = (
    book.add_sheet(label, cell_overwrite_ok=True)
    for label in ('Sheet1', 'Sheet2')
)
# Write the first title's text to row `cor`, column 0 of each sheet.
for target in (sheet1, sheet2):
    target.write(cor, 0, site_title[0].text)
import numpy as np
import pandas as pd
# Two 50-row frames of random numbers with the columns 'a' and 'b'.
column_names = ['a', 'b']
df1, df2 = (
    pd.DataFrame(np.random.rand(100).reshape(50, 2), columns=column_names)
    for _ in range(2)
)
# Workbook that receives the frames.
excelpath = 'path_to_your_excel.xlsx'
# One worksheet per frame; the workbook is saved when the with-block exits.
with pd.ExcelWriter(excelpath) as writer:
    for sheet_name, frame in {'Sheet1': df1, 'Sheet2': df2}.items():
        frame.to_excel(writer, sheet_name=sheet_name)
""" I noticed that the above script overwrite all existing columns of in
the Excel file. In case you want to keep some columns and sheets untouched,
you might consider doing it the following way"""
import pandas as pd
import numpy as np
from openpyxl import load_workbook
# mode='a' opens the existing workbook instead of replacing it, and
# if_sheet_exists='overlay' writes into sheets that are already there, so
# cells outside the written range stay as they were.  This replaces the
# load_workbook / writer.book / writer.sheets idiom, and the undefined name
# `pandas` (the module is imported as `pd`).
# NOTE(review): if_sheet_exists='overlay' needs pandas 1.4 or newer — confirm
# the installed version.
with pd.ExcelWriter(excelpath, engine='openpyxl', mode='a',
                    if_sheet_exists='overlay') as writer:
    # Only columns 'a' and 'b' will be populated on each sheet.
    df1.to_excel(writer, sheet_name="Sheet1", columns=['a', 'b'])
    df2.to_excel(writer, sheet_name="Sheet2", columns=['a', 'b'])
--Append Excel Data Sheet to Spreadsheet
import pandas as pd
#import os
#from pandasql import sqldf
#pysqldf = lambda q: sqldf(q, globals())
df1 = pd.read_csv('MyData1.csv')
df2 = pd.read_csv('MyData2.csv')
print(df1)
print(df2)
# Outer merge on the columns the two frames share; indicator=True adds a
# `_merge` column that says whether a row came from the left frame only, the
# right frame only, or both.
Differences_df = df1.merge(df2, indicator=True, how='outer')
# To keep only the rows that exist in df2 alone:
#   Differences_df[Differences_df['_merge'] == 'right_only']
print(Differences_df)
# Append mode is not supported by the xlsxwriter engine, which pandas picks
# for .xlsx by default when it is installed, so openpyxl is requested
# explicitly.  MyInputData.xlsx must already exist.
with pd.ExcelWriter('MyInputData.xlsx', engine='openpyxl', mode='a') as writer:
    Differences_df.to_excel(writer, sheet_name='Diff')
print("Spreadsheets Processed")