openpyxl: Formulas read as blanks in some (key use) cases

My code downloads a .xlsx file from Google Drive (using pydrive), finds some blank cells with pandas, and fills in those blank cells with openpyxl.
When I open the openpyxl-altered file, everything looks great. However, when I use the pandas read_excel function, all cells that contain formulas are read as blanks. I suspect the issue is with openpyxl, because when I preview the file on Drive those cells are blank too, and there is no issue with a file that openpyxl hasn't touched.
It looks like my issue is very similar to this one, but since my objective is just to leave the formulas untouched (I only want to fill blank cells), I don't really want to parse the formulas and I'm not really sure how or if to apply Felipe's fix.
I'd like to be able to download the file to plot it with bokeh, and both users and Python will be editing the workbook, so I'd really like pandas to be able to read the formulas whether it's a user-modified file or an openpyxl-modified file. The formulas in the file were click-and-drag "shared formulas", and I'd like to keep it that way if possible, so ideally I'd like to avoid using data_only=True. I tried specifying data_only=False, but this didn't appear to change anything.
I'm using openpyxl 2.4 (originally 2.3.5), and I keep Excel closed while the code is running.
Versions of the file before and after openpyxl modification are available here.
My code is here; all openpyxl code is isolated to:
# Import libraries
import datetime
import imp
import os
import pandas as pd
from openpyxl import load_workbook
from itertools import islice

# Relative imports for bokeh interaction
dl = imp.load_source('downloader', os.getcwd() + '/Project/downloader.py')
gdu = imp.load_source('googledriveutils', os.getcwd() + '/Project/googledriveutils.py')
remove_file = gdu.remove_file
find_folderid = gdu.find_folderid
get_file_list = gdu.get_file_list

# Define constants
COL_LABEL = '\nProbe - '
# TODO: ORP PROBE: REVISE THIS DATE when orp probe is added
IGNORE_BEFORE = pd.to_datetime('5.24.2016')
PROBE_DICT = {'DO (mg/L)': 'DO mg/L',
              'pH': 'pH',
              'NH4+ (mgN/L)': 'Ammonium',
              'ORP (mV)': 'ORP mV'}
TS = '\nTimestamps'
def save_to_workbook(newval,
                     date,
                     header,
                     rows_to_skip=12,
                     wbname='temp.xlsx',
                     sheet_name='Reactor Data'):
    wb = load_workbook(wbname)
    ws = wb[sheet_name]
    for cell in ws[rows_to_skip + 1]:
        # TODO: Error if header isn't found
        if cell.value == header:
            colno = cell.col_idx
            break
    for row in ws.iter_rows(min_row=rows_to_skip + 1, min_col=1, max_col=1):
        for cell in row:
            # TODO: Error if date isn't found
            if cell.value == date:
                rowno = cell.row
                break
    ws.cell(row=rowno, column=colno).value = newval
    wb.save(wbname)
def find_r1masterfile():
    # Navigate through the directories
    wlab_fid = find_folderid('Winkler Lab', 'root')
    kp_fid = find_folderid('KathrynsProjects', wlab_fid)
    amxrct_fid = find_folderid('Anammox Reactor', kp_fid)
    trials_fid = find_folderid('Reactor Trials', amxrct_fid)
    # List files in directory
    file_list = get_file_list(trials_fid)
    for afile in file_list:
        if afile['title'] == 'AMX RCT.xlsx':
            # Return the file we asked for
            return afile
    # TODO: error if there was no file with that name
def save_r1masterfile(csv, rows_to_skip=12, filename='temp.xlsx', sheet_name='Reactor Data'):
    # Get the file we want
    master_file = find_r1masterfile()
    try:
        master_file.GetContentFile(filename)
    except Exception, e:
        print "Warning: Something wrong with file R1 Master File."
        print str(e)
        # TODO: add an email alarm to responsible user
    if csv:
        return master_file
    else:
        # Convert to dataframe
        wb = load_workbook(filename, data_only=True)
        ws = wb[sheet_name]
        print ws["B14"].value
        data = ws.values
        data = list(data)[rows_to_skip:]
        cols = list(data[0])
        del cols[0]
        del data[0]
        idx = [r[0] for r in data]
        data = (islice(r, 1, None) for r in data)
        df = pd.DataFrame(data, index=idx, columns=cols)
        print df.dropna(how='all')
        remove_file(filename)
        return df
def upload_r1masterfile(filename='temp.xlsx'):
    # Get the file we want
    master_file = find_r1masterfile()
    try:
        master_file.SetContentFile(filename)
        master_file.Upload()
    except Exception, e:
        print "Warning: Something wrong with file R1 Master File."
        print str(e)
        # TODO: add an email alarm to responsible user
def populate_r1masterfile(rows_to_skip=12, filename='temp.xlsx'):
    # Get the R1 master file as a file
    save_r1masterfile(True)
    # Convert the juicy stuff to a dataframe
    masterdf = pd.read_excel(filename,
                             sheetname='Reactor Data',
                             encoding="utf-16",
                             skiprows=rows_to_skip,
                             sep='\t',
                             index_col='Date',
                             keep_default_na=False,
                             na_values=['-1.#IND', '1.#QNAN', '1.#IND',
                                        '-1.#QNAN', '', 'N/A', '#NA', 'NA',
                                        'NULL', 'NaN', '-NaN', 'nan', '-nan'])
    # Find what we will populate with probe data
    # Find timestamps
    ts_columns = [col for col in masterdf.columns if TS in col]
    tsdf = masterdf[ts_columns]
    # Find probes, ignore before given date
    probe_columns = [col for col in masterdf.columns if COL_LABEL in col]
    probedf = masterdf[probe_columns]
    probedf = probedf[masterdf.index > IGNORE_BEFORE]
    # Find indices and column labels of blank values
    stackdf = probedf.stack(dropna=False)
    empty = stackdf[stackdf.isnull()].index.tolist()
    # For each blank, look up the probe, time & date of cycle, and return val
    for each in empty:
        probe, time = each[1].split(COL_LABEL)
        time = tsdf.loc[each[0], time + TS]
        ts = each[0] + pd.DateOffset(hour=time.hour, minute=time.minute)
        val = dl.get_val_from(1, ts, PROBE_DICT.get(probe))
        probedf.set_value(each[0], each[1], val)
        # Save that value to the workbook
        save_to_workbook(val, each[0], each[1])
    upload_r1masterfile()
    print 'Master file updated. ' + str(datetime.datetime.now())
    remove_file('temp.xlsx')
    return probedf
UPDATE
I modified my code as per Charlie's suggestions (updated above), but I'm still getting Nones in the dataframe. To give a more specific example: why, when I run this code:
from openpyxl import load_workbook
wb = load_workbook('AMX RCT mod.xlsx', data_only=True)
ws = wb['Reactor Data']
print 'Value of B14 Formula is: ' + str(ws["B14"].value)
with this file, do I get back:
Value of B14 Formula is: None
Is there a workaround?

Using openpyxl 2.4 you might be able to do what you need in a single pass. I've taken your first function and adapted it.
from itertools import islice
from pandas import DataFrame
from openpyxl import load_workbook

def save_to_workbook(newval,
                     date,
                     header,
                     rows_to_skip=12,
                     wbname='temp.xlsx',
                     sheet_name='Reactor Data'):
    wb = load_workbook(wbname)
    ws = wb[sheet_name]
    rowno = None
    colno = None
    for cell in ws[1]:
        # TODO: Error if header isn't found
        if cell.value == header:
            colno = cell.col_idx
    for row in ws.iter_rows(min_row=rows_to_skip + 1, min_col=1, max_col=1):
        for cell in row:
            # TODO: Error if date isn't found
            if cell.value == date:
                rowno = cell.row
                break
    # TODO: Fix this
    ws.cell(row=rowno, column=colno).value = newval
    # Convert to dataframe
    data = ws.values
    cols = next(data)[1:]
    data = list(data)
    idx = [r[0] for r in data]
    data = (islice(r, 1, None) for r in data)
    df = DataFrame(data, index=idx, columns=cols)
    return df
This probably doesn't do everything you want but will hopefully get you started. It also avoids saving and parsing a whole workbook which could make it quite a bit faster.
To work with openpyxl 2.4 you will either need to do pip install -U --pre openpyxl or work with a checkout.
For further information on using openpyxl and pandas together please see the documentation.

Charlie's Answer from the mailing list:
So, if you want to keep the formulae then you must not use data-only mode. As previously stated, openpyxl never evaluates formulae, so if you want to know the value of A3 you must pass the file to an application that does do formula evaluation, such as Excel or OpenOffice (you can run OpenOffice headless for this kind of thing, or use xlwings for Excel). You could then read this file in data-only mode to see the result of the calculation. Alternatively, you could try using something like PyCel to do the evaluation for you. But basically, if you want to do calculations: do them in Python.
As per his suggestion, my workaround was to redo all the calculations column by column, as they are done in the Excel file. I.e., for an Excel file like this:

       col1  col2  col3    col4
row1   1     3     =A1+B1  =1+3
row2   2     4     =A2+B2  =2+4
I import it as a dataframe like this (to keep the formulas as strings):
wb = load_workbook(filename, data_only=False)
ws = wb[sheet_name]
data = ws.values
cols = next(data)[1:]
data = list(data)
idx = [r[0] for r in data]
data = (islice(r, 1, None) for r in data)
df = DataFrame(data, index=idx, columns=cols)
And then do the following:
parse_excel = lambda x: eval(str(x)[1:]) if isinstance(x, str) else x
for col in df.columns:
    try:
        df[col] = df[col].map(parse_excel)
    except:
        pass
df['col3'] = df['col2'] + df['col1']
This is maybe the clumsiest way to do it, but it works for now.
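As a side note, for simple column arithmetic pandas can evaluate the expression for you, which reads a little cleaner than rebuilding each formula by hand (a sketch, assuming the col1/col2 layout above):

df['col3'] = df.eval('col1 + col2')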

Related

How can I highlight in one color the cells that changed for a same row between two dataframes and in another color what is new? Different Shape

I tried many functions and tried to apply existing solutions, yet I can't seem to get an Excel output that keeps the formatting I'm trying to apply.
All the existing pandas functions seem to require identically labelled indexes or files of the same shape. In my situation, the shapes of the two files are (757, 26) for file1 and (688, 39) for file2, and the first 26 columns are labelled the same way in both files.
Is there a way to merge these two files, highlight the differences as indicated in the title, and create an Excel output with the formatting still present?
Here is what I tried:
import pandas as pd
import numpy as np
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl import Workbook
import pandas.io.formats.style as style
dfcurr=pd.read_excel(r'IPH4201P2 - DRATracker_Current.xlsx')
dfprev=pd.read_excel(r'IPH4201P2 - DRATracker_Previous.xlsx')
dfprev=dfprev.loc[(dfprev['Subject'].str.contains('M'))|(dfprev['Subject'].str.contains('S'))]
dfprev=dfprev.reset_index()
dfprev=dfprev.drop(columns='index')
df_diff=pd.merge(dfcurr,dfprev,how='left',indicator=True)
common_columns = df_diff.columns.intersection(dfprev.columns)
compare_df = df_diff[common_columns].eq(dfprev[common_columns])
compare_df.to_excel('comp.xlsx')
# Convert dataframe to string
df_diff = df_diff.astype(str)
def highlight_diff(data, compare):
    if type(data) != pd.DataFrame:
        data = pd.DataFrame(data)
    if type(compare) != pd.DataFrame:
        compare = pd.DataFrame(compare)
    result = []
    for col in data.columns:
        if col in compare.columns and (data[col] != compare[col]).any():
            result.append('background-color: #DAEEF3')
        elif col not in compare.columns:
            result.append('background-color: #E4DFEC')
        else:
            result.append('background-color: white')
    return result
# Create a new workbook and add a new worksheet
wb = Workbook()
ws = wb.active
# Write the dataframe to the worksheet
for r in dataframe_to_rows(df_diff.style.apply(highlight_diff, compare=compare_df).data, index=False, header=True):
    ws.append(r)
# Save the workbook
wb.save('Merged_style.xlsx')
However, I do not get an output with the style applied; no cells are highlighted in the color I want them to be highlighted in.
Edit:
I tried a different approach to highlight the cells in the excel, the function used for this approach comes from here:
import pandas as pd
import numpy as np
import openpyxl
import pandas.io.formats.style as style
dfcurr=pd.read_excel(r'IPH4201P2 - DRATracker_Current.xlsx')
dfprev=pd.read_excel(r'IPH4201P2 - DRATracker_Previous.xlsx')
dfprev=dfprev.loc[(dfprev['Subject'].str.contains('M'))|(dfprev['Subject'].str.contains('S'))]
dfprev=dfprev.reset_index()
dfprev=dfprev.drop(columns='index')
new='background-color: #DAEEF3'
change='background-color: #E4DFEC'
df_diff=pd.merge(dfcurr,dfprev,on=['Subject','Visit','Visit Date','Site\nID','Cohort','Pathology','Clinical\nStage At\nScreening','TNMBA at\nScreening'],how='left',indicator=True)
for col in df_diff.columns:
    if '_y' in col:
        del df_diff[col]
    elif 'Unnamed: 1' in col:
        del df_diff[col]
    elif '_x' in col:
        df_diff.columns = df_diff.columns.str.rstrip('_x')
def highlight_diff(data, other, color='#DAEEF3'):
    # Define html attribute
    attr = 'background-color: {}'.format(color)
    # Where data != other set attribute
    return pd.DataFrame(np.where(data.ne(other), attr, ''),
                        index=data.index, columns=data.columns)
# Set axis=None so it passes the entire frame
df_diff=df_diff.style.apply(highlight_diff, axis=None, other=dfprev)
print(type(df_diff))
df_diff.to_excel('Diff.xlsx',engine='openpyxl',index=0)
This new method gives me an Excel file with the style applied. How can I update it to apply the color #DAEEF3 to rows in df_diff whose Subject, Visit and Visit Date are not present in dfprev, and the color #E4DFEC to cells that differ between the two files for matching Subject, Visit and Visit Date?
This code isn't doing anything:
df_diff.style.apply(highlight_diff, compare=compare_df).data
df_diff.style creates a Styler object.
.apply applies a function to that Styler so it attaches the relevant HTML styling, which it stores as a mapping in the Styler context.
.data then just retrieves the original DataFrame you created the Styler from; it has nothing to do with the HTML styling context you built up, so the trailing .data effectively discards it.
Styler has its own to_excel method which interprets some of that HTML styling context and converts it to Excel cell coloring and formatting.
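In other words, keep the Styler you built and export it directly; a minimal sketch using the objects from the question:

styled = df_diff.style.apply(highlight_diff, compare=compare_df)
styled.to_excel('Merged_style.xlsx', engine='openpyxl', index=False)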
After asking around among people I know who have had to do something similar, here is the final code that produces the expected output:
import pandas as pd
from openpyxl.styles import PatternFill
from openpyxl import load_workbook

# DATA FILES #################################################
# Set below to False to copy comments manually and keep the comment formatting.
copy_comments_automatically = True
# Update folderPath
folderPath = "C:/Users/G.Tielemans/OneDrive - Medpace/Documents/Innate/Script/DRA/"
# File names must match and files must be closed when running
dfcurr = pd.read_excel(folderPath + "IPH4201P2 - DRATracker_Current.xlsx")
dfprev = pd.read_excel(folderPath + "IPH4201P2 - DRATracker_Previous.xlsx")
##############################################################

# LOADING DATA ###############################################
dfprev = dfprev.loc[(dfprev['Subject'].str.contains('M')) | (dfprev['Subject'].str.contains('S'))]
dfprev = dfprev.reset_index()
dfprev = dfprev.drop(columns='index')
dfprevComments = dfprev.iloc[:, 29:]
##############################################################

# NEW LINES ##################################################
def highlightNewLines(linecurr):
    currSubject = linecurr["Subject"]
    currVisit = linecurr["Visit"]
    currVisitDate = linecurr["Visit Date"]
    for index, row in dfprev.iterrows():
        if currSubject == row["Subject"] and currVisit == row["Visit"] and currVisitDate == row["Visit Date"]:
            return True
    return False

dfcurr["Duplicate?"] = dfcurr.apply(lambda row: highlightNewLines(row), axis=1)
##############################################################

# FIND UPDATES ###############################################
dfDupes = dfcurr[dfcurr["Duplicate?"] == True]
dfDupeslen = len(dfDupes)
# Indexes of new lines to paste at bottom of file and color
indexes = dfcurr[dfcurr["Duplicate?"] == False].index
dfDupes = dfDupes.drop("Duplicate?", axis=1)
dfDupes = dfDupes.reset_index(drop=True)
dfprev = dfprev.iloc[:, 0:26]
dfprev = dfprev.reset_index(drop=True)
difference = dfDupes[dfDupes != dfprev]
##############################################################

# ATTACH NEW FINDINGS AND PASTE MEDPACE COMMENT COLUMNS ######
newfindings = dfcurr.loc[indexes]
newfindings = newfindings.drop("Duplicate?", axis=1)
dfDupes = pd.concat([dfDupes, newfindings])
dfDupes = dfDupes.reset_index(drop=True)
dflen = len(dfDupes)
if copy_comments_automatically:
    dfDupes = pd.concat([dfDupes, dfprevComments], axis=1)
##############################################################

# COLORING ###################################################
dfDupes.to_excel(folderPath + "IPH4201P2 - DRATracker_Output.xlsx", index=False)
wb = load_workbook(folderPath + "IPH4201P2 - DRATracker_Output.xlsx")
ws = wb.active
fillred = PatternFill(start_color="ffc7ce", end_color="ffc7ce", fill_type="solid")
fillblue = PatternFill(start_color="99ccff", end_color="99ccff", fill_type="solid")
for row in range(len(difference)):
    for column in range(len(difference.columns)):
        if pd.isnull(difference.iloc[row, column]) == False:
            ws.cell(row + 2, column + 1).fill = fillred
for row in range(dfDupeslen, dflen):
    for column in [2, 5, 6]:
        ws.cell(row + 2, column).fill = fillblue
wb.save(folderPath + "IPH4201P2 - DRATracker_Output.xlsx")
##############################################################
print("Done")

Python: subtract cell values between 2 worksheets in Excel

Is it possible to write a Python script that automatically subtracts cell values between two worksheets in one Excel file?
I have looked through some documentation, and it seems that either pandas or openpyxl could do this, but I haven't been able to make it work. Do you have any suggestions? Many thanks.
Script:
from datetime import datetime
import pandas as pd
import openpyxl as xl

currDateTime = datetime.now()
Sheet1 = "C:\\Users\\peter\\Downloads\\" + currDateTime.strftime('%Y%m%d') + "\\5250A" + "\\5250A.xlsx"
wb3 = xl.load_workbook(Sheet1)
ws3 = wb3.worksheets[0]
wb4 = xl.load_workbook(Sheet1)
ws4 = wb4.worksheets[1]
wb5 = xl.load_workbook(Sheet1)
ws5 = wb5.create_sheet("Done")
wb4.subtract(wb3)
wb5.save(str(Sheet1))
Expected Result:
Doing this in Excel itself could be way easier, I think, and there may be a smarter way to write this code.
[NOTE] I do the subtraction cell by cell, so any mismatch (same row but a different dept. id, or same column but a different item) will cause errors. If you might hit that situation, you'll have to change some of the following code.
import openpyxl as xl

def get_row_values(worksheet):
    """
    return data structure:
    [
        [A1, B1, C1, ...],
        [A2, B2, C2, ...],
        ...
    ]
    """
    result = []
    for i in worksheet.rows:
        row_data = []
        for j in i:
            row_data.append(j.value)
        result.append(row_data)
    return result
if __name__ == '__main__':
    # load excel file
    wb = xl.load_workbook('test1.xlsx')
    ws1 = wb.worksheets[0]
    ws2 = wb.worksheets[1]
    # get data from the first 2 worksheets
    ws1_rows = get_row_values(ws1)
    ws2_rows = get_row_values(ws2)
    # calculate and make a new sheet
    ws_new = wb.create_sheet('Done')
    # insert header
    ws_new.append(ws1_rows[0])
    for row in range(1, len(ws1_rows)):
        # do the subtraction cell by cell
        row_data = []
        for column, value in enumerate(ws1_rows[row]):
            if column == 0:
                # insert first column
                row_data.append(value)
            else:
                if ws1_rows[row][0] == ws2_rows[row][0]:
                    # process only when first column matches
                    row_data.append(value - ws2_rows[row][column])
        ws_new.append(row_data)
    wb.save('test2.xlsx')
Here's my sample Excel file (the first sheet, the second sheet, and the generated sheet are shown as screenshots in the original post).
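For reference, the same subtraction can be sketched with pandas, which aligns rows and columns by label rather than by position (file and sheet names are illustrative, and this assumes the first column holds the row labels):

import pandas as pd

# Read the first two worksheets, using the first column as the index
sheets = pd.read_excel('test1.xlsx', sheet_name=[0, 1], index_col=0)
# Subtract element-wise; rows and columns are matched by label
done = sheets[0] - sheets[1]
# Append the result to the workbook as a new sheet
with pd.ExcelWriter('test1.xlsx', engine='openpyxl', mode='a') as writer:
    done.to_excel(writer, sheet_name='Done')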

How can I concatenate multiple rows of excel data into one?

I'm currently facing an issue where I need to bring all of the data shown in the images (included in the original post) into one line only.
So, using Python and openpyxl, I tried to write a parsing script that reads each line and copies values into a new workbook only when they are non-null or non-identical.
I get out of range errors, and the code does not keep just the data I want. I've spent multiple hours on it, so I thought I would ask here to see if I can get unstuck.
I've read some documentation on Openpyxl and about making lists in python, tried a couple of videos on youtube, but none of them did exactly what I was trying to achieve.
import openpyxl
from openpyxl import Workbook

path = "sample.xlsx"
wb = openpyxl.load_workbook(path)
ws = wb.active
path2 = "output.xlsx"
wb2 = Workbook()
ws2 = wb2.active
listab = []
rows = ws.max_row
columns = ws.max_column
for i in range(1, rows + 1):
    listab.append([])
cellValue = " "
prevCell = " "
for c in range(1, rows + 1):
    for r in range(1, columns + 1):
        cellValue = ws.cell(row=r, column=c).value
        if cellValue == prevCell:
            listab[r - 1].append(prevCell)
        elif cellValue == "NULL":
            listab[r - 1].append(prevCell)
        elif cellValue != prevCell:
            listab[r - 1].append(cellValue)
        prevCell = cellValue
for r in range(1, rows + 1):
    for c in range(1, columns + 1):
        j = ws2.cell(row=r, column=c)
        j.value = listab[r - 1][c - 1]
print(listab)
wb2.save("output.xlsx")
There should be one line with the below information:
ods_service_id | service_name| service_plan_name| CPU | RAM | NIC | DRIVE |
Personally I would go with pandas.
import pandas as pd

# Loading into pandas
df_data = pd.read_excel('sample.xlsx')
df_data.fillna("NO DATA", inplace=True)  # Replace nan values with "NO DATA"
unique_ids = df_data.ods_service_ids.unique()
# Storing the records in a list
records_list = df_data.to_dict('records')
keys_to_check = ['service_name', 'service_plan_name', 'CPU', 'RAM', 'NIC', 'DRIVE']
processed = {}
# Go through unique ids
for key in unique_ids:
    processed[key] = {}
    # Get related records
    matching_records = [y for y in records_list if y['ods_service_ids'] == key]
    # Loop through records
    for record in matching_records:
        # For each key to check, save in dict if non null
        processed[key]['ods_service_ids'] = key
        for detail_key in keys_to_check:
            if record[detail_key] != "NO DATA":
                processed[key][detail_key] = record[detail_key]
# Note: doesn't handle duplicate values for different keys so far
# Records are put back in a list
output_data = [processed[x] for x in processed.keys()]
# -> to pandas
df = pd.DataFrame(output_data)[['ods_service_ids', 'service_name', 'service_plan_name', 'CPU', 'RAM', 'NIC', 'DRIVE']]
# Export to Excel
df.to_excel("output.xlsx", sheet_name='Sheet_name_1', index=False)
The above should work, but I wasn't really sure how you wanted to save duplicated records for the same id. Do you want to store them as DRIVE_0, DRIVE_1, DRIVE_2?
EDIT:
df could be exported in a different way. Replace the # Export to Excel line above with the following:
df.to_excel("output.xlsx", sheet_name='Sheet_name_1')
EDIT 2:
With no input data it was hard to spot any flaws; I corrected the code above using fake data.
To be honest, I think you've managed to get confused by data structures and come up with something far more complicated than you need.
One approach that would suit would be to use Python dictionaries for each service, updating them row by row.
from openpyxl import load_workbook, Workbook

wb = load_workbook("sample.xlsx")
ws = wb.active
objs = {}
headers = next(ws.iter_rows(min_row=1, max_row=1, values_only=True))
for row in ws.iter_rows(min_row=2, values_only=True):
    if row[0] not in objs:
        obj = {key: value for key, value in zip(headers, row)}
        objs[obj['ods_service_id']] = obj
    else:  # update dict with non-None values
        extra = {key: value for key, value in zip(headers[3:], row[3:]) if value != "NULL"}
        objs[row[0]].update(extra)

# write to new workbook
wb2 = Workbook()
ws2 = wb2.active
ws2.append(headers)
for obj in objs.values():  # do they need sorting?
    ws2.append([obj[key] for key in headers])
Note how you can do everything without using counters.

Looping through a folder to merge several excel sheets into one column

I have several workbooks, each with three sheets. I want to loop through each workbook and merge all the data from sheet_1 into a new workbook_1 file, sheet_2 into workbook_2 file & sheet_3 into workbook_3.
As far as I can tell the script below does everything I need, except rather than appending the data, it overwrites the data from the previous iteration.
For the sake of parsimony I've shortened, cleaned & simplified my script, but I'm happy to share the full script if needed.
import pandas as pd
import glob

search_dir = '/Users/PATH/*.xlsx'
sheet_names = ['sheet_1', 'sheet_2', 'sheet_3']

def a_joiner(sheet):
    for loop_x in glob.glob(search_dir):
        try:
            if sheet == 'sheet_1':
                id_file = pd.ExcelFile(loop_x)
                df_1 = id_file.parse(sheet, header=None)
                writer = pd.ExcelWriter('/Users/PATH/%s.xlsx' % (sheet), engine='xlsxwriter')
                df_1.to_excel(writer)
                writer.save()
            elif sheet == 'sheet_2':
                pass  # do same as above
            else:
                pass  # and do same as above again
        except Exception as e:
            print('Error:', e)

for sheet in sheet_names:
    a_joiner(sheet)
You can also easily append data like:

import os
import pandas as pd

df = []
for f in ['c:\\file1.xls', 'c:\\file2.xls']:
    data = pd.read_excel(f, 'Sheet1').iloc[:-2]
    data.index = [os.path.basename(f)] * len(data)
    df.append(data)
df = pd.concat(df)
From:
Using pandas Combining/merging 2 different Excel files/sheets
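Adapted to the question, the idea is to accumulate each sheet's frames across all workbooks and write once after the loop, instead of recreating the ExcelWriter on every iteration (a sketch using the question's placeholder paths; the output directory is kept separate so the merged files aren't picked up by the glob):

import glob
import pandas as pd

search_dir = '/Users/PATH/*.xlsx'
for sheet in ['sheet_1', 'sheet_2', 'sheet_3']:
    # collect this sheet from every workbook
    frames = [pd.read_excel(f, sheet_name=sheet, header=None)
              for f in glob.glob(search_dir)]
    # write the combined data once
    pd.concat(frames, ignore_index=True).to_excel(
        '/Users/MERGED_PATH/%s.xlsx' % sheet, index=False)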

How can I open an Excel file in Python?

How do I open a file that is an Excel file for reading in Python?
I've opened text files, for example, sometextfile.txt with the reading command. How do I do that for an Excel file?
Edit:
In the newer version of pandas, you can pass the sheet name as a parameter.
file_name = # path to file + file name
sheet = # sheet name or sheet number or list of sheet numbers and names
import pandas as pd
df = pd.read_excel(io=file_name, sheet_name=sheet)
print(df.head(5)) # print first 5 rows of the dataframe
Check the docs for examples on how to pass sheet_name: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
Old version:
you can use pandas package as well....
When you are working with an excel file with multiple sheets, you can use:
import pandas as pd
xl = pd.ExcelFile(path + filename)
xl.sheet_names
>>> [u'Sheet1', u'Sheet2', u'Sheet3']
df = xl.parse("Sheet1")
df.head()
df.head() will print first 5 rows of your Excel file
If you're working with an Excel file with a single sheet, you can simply use:
import pandas as pd
df = pd.read_excel(path + filename)
print df.head()
Try the xlrd library.
[Edit] - from what I can see from your comment, something like the snippet below might do the trick. I'm assuming here that you're just searching one column for the word 'john', but you could add more or make this into a more generic function.
from xlrd import open_workbook

book = open_workbook('simple.xls', on_demand=True)
for name in book.sheet_names():
    if name.endswith('2'):
        sheet = book.sheet_by_name(name)
        # Attempt to find a matching row (search the first column for 'john')
        rowIndex = -1
        for i, cell in enumerate(sheet.col(0)):
            if 'john' in cell.value:
                rowIndex = i
                break
        # If we found the row, print it
        if rowIndex != -1:
            cells = sheet.row(rowIndex)
            for cell in cells:
                print cell.value
        book.unload_sheet(name)
This isn't as straightforward as opening a plain text file and will require some sort of external module since nothing is built-in to do this. Here are some options:
http://www.python-excel.org/
If possible, you may want to consider exporting the excel spreadsheet as a CSV file and then using the built-in python csv module to read it:
http://docs.python.org/library/csv.html
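If you do export to CSV, reading it back is as close as it gets to the plain-text handling you already know; a minimal sketch with a placeholder file name:

import csv

with open('somefile.csv') as f:
    for row in csv.reader(f):
        print row  # each row is a list of strings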
There's the openpyxl package:

>>> from openpyxl import load_workbook
>>> wb2 = load_workbook('test.xlsx')
>>> print(wb2.get_sheet_names())
['Sheet2', 'New Title', 'Sheet1']
>>> worksheet1 = wb2['Sheet1'] # one way to load a worksheet
>>> worksheet2 = wb2.get_sheet_by_name('Sheet2') # another way to load a worksheet
>>> print(worksheet1['D18'].value)
3
>>> for row in worksheet1.iter_rows():
...     print(row[0].value)
You can use xlpython package that requires xlrd only.
Find it here https://pypi.python.org/pypi/xlpython
and its documentation here https://github.com/morfat/xlpython
This may help:
This creates a node that takes a 2D list (a list of list items) and pushes it into an Excel spreadsheet. Make sure the IN[]s are present, or it will throw an exception.
This is a rewrite of the Revit Excel Dynamo node for Excel 2013, as the default prepackaged node kept breaking. I also have a similar read node. The Excel syntax in Python is touchy.
thnx @CodingNinja - updated : )
### Export Excel - intended to replace malfunctioning Excel node
import clr
clr.AddReferenceByName('Microsoft.Office.Interop.Excel, Version=15.0.0.0, Culture=neutral, PublicKeyToken=71e9bce111e9429c')
## AddReferenceGUID("{00020813-0000-0000-C000-000000000046}") ''Excel C:\Program Files\Microsoft Office\Office15\EXCEL.EXE
## Need to verify interop for version 2015 is 15 and node attachment for it.
from Microsoft.Office.Interop import *  ## Excel

################################ Initialize FP and Sheet ID
## Same functionality as the Excel node
strFileName = IN[0]  ## Filename
sheetName = IN[1]    ## Sheet
RowOffset = IN[2]    ## Row offset
ColOffset = IN[3]    ## Col offset
Data = IN[4]         ## Data
Overwrite = IN[5]    ## Check for auto-overwrite
XLVisible = False    # IN[6] ## XL visible for operation or not?
RowOffset = 0
if IN[2] > 0:
    RowOffset = IN[2]  ## Row offset
ColOffset = 0
if IN[3] > 0:
    ColOffset = IN[3]  ## Col offset
if IN[6] <> False:
    XLVisible = True  # IN[6] ## XL visible for operation or not?

################################ Initialize FP and Sheet ID
xlCellTypeLastCell = 11  ##### define special cells value constant
################################
xls = Excel.ApplicationClass()  #### Connect with application
xls.Visible = XLVisible   ## VISIBLE YES/NO
xls.DisplayAlerts = False  ### Alerts

import os.path
if os.path.isfile(strFileName):
    wb = xls.Workbooks.Open(strFileName, False)  #### Open the file
else:
    wb = xls.Workbooks.Add()  #### Create the file
    wb.SaveAs(strFileName)
wb.application.visible = XLVisible  #### Show Excel
try:
    ws = wb.Worksheets(sheetName)  #### Get the sheet in the WB base
except:
    ws = wb.Worksheets.Add()  #### If it doesn't exist, add it; use () for object method
    ws.Name = sheetName

#################################
# lastRow for iterating rows
lastRow = ws.UsedRange.SpecialCells(xlCellTypeLastCell).Row
# lastCol for iterating columns
lastCol = ws.UsedRange.SpecialCells(xlCellTypeLastCell).Column
#######################################################################
out = []  ### MESSAGE GATHERING
c = 0
r = 0
val = ""
if Overwrite == False:  #### Look ahead for non-empty target cells to throw error
    for r, row in enumerate(Data):     #### BASE 0 ## each row of data enumerated in the 2D array
        for c, col in enumerate(row):  #### BASE 0 ## each column in each row is a cell with data
            if ws.Cells[r + 1 + RowOffset, c + 1 + ColOffset].Value2 > "":
                OUT = "ERROR- Cannot overwrite"
                raise ValueError("ERROR- Cannot overwrite")
            ## out.append(Data[0])  ## append message for error
############################################################################
for r, row in enumerate(Data):     #### BASE 0 ## each row of data enumerated in the 2D array
    for c, col in enumerate(row):  #### BASE 0 ## each column in each row is a cell with data
        ws.Cells[r + 1 + RowOffset, c + 1 + ColOffset].Value2 = col.__str__()
## Run macro disabled for debugging Excel macro
## xls.Application.Run("Align_data_and_Highlight_Issues")
import pandas as pd
import os

files = os.listdir('path/to/files/directory/')
desiredFile = files[i]  # i = index of the file you want
filePath = 'path/to/files/directory/%s'
Ofile = filePath % desiredFile
xls_import = pd.read_csv(Ofile)
Now you can use the power of pandas DataFrames!
This code worked for me with Python 3.5.2. It opens a CSV file for writing (which Excel can open). I am currently working on how to save data into the file, but this is the code:

import csv
excel = csv.writer(open("file1.csv", "w", newline=""))