openpyxl read tables from existing data book example? - python

In the openpyxl documentation there is an example of how to place a table into a workbook but there are no examples of how to find back the tables of a workbook. I have an XLS file that has named tables in it and I want to open the file, find all of the tables and parse them. I cannot find any documentation on how to do this. Can anyone help?
In the meantime I worked it out and wrote the following class to work with openpyxl:
class NamedArray(object):
    ''' Excel Named range object

    Reproduces the named range feature of Microsoft Excel
    Assumes a definition in the form PinList!$A$6:$A$52 (optionally with the
    sheet name in single quotes) as provided by openpyxl
    Written for use with, and initialised by the get_names function
    After initialisation named array can be used in the same way as for VBA in excel
    Written for openpyxl version 2.4.1, may not work with earlier versions
    '''
    C_CAPS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    def __init__(self, wb, named_range_raw):
        ''' Initialise a NameArray object from the named_range_raw information in the given workbook

        wb is indexed by sheet name; named_range_raw is converted with str()
        and must look like "Sheet!$A$1" or "Sheet!$A$1:$B$9".
        '''
        self.sheet, cellrange_str = str(named_range_raw).split('!')
        self.sheet = self.sheet.replace("'", '')  # remove the single quotes if they exist
        self.loc = wb[self.sheet]
        if ':' in cellrange_str:
            self.has_range = True
            self.has_value = False
            lo, hi = cellrange_str.split(':')
            self.ad_lo = lo.replace('$', '')
            self.ad_hi = hi.replace('$', '')
        else:
            # a single cell is treated as a 1x1 range
            self.has_range = False
            self.has_value = True
            self.ad_lo = cellrange_str.replace('$', '')
            self.ad_hi = self.ad_lo
        self.row = self.get_row(self.ad_lo)
        self.max_row = self.get_row(self.ad_hi)
        self.rows = self.max_row - self.row + 1
        self.min_col = self.col_to_n(self.ad_lo)
        self.max_col = self.col_to_n(self.ad_hi)
        self.cols = self.max_col - self.min_col + 1

    def size_of(self):
        ''' Returns two dimensional size of named space as (columns, rows)
        '''
        return self.cols, self.rows

    def value(self, row=1, col=1):
        ''' Returns the value at row, col

        row and col are 1-based offsets inside the named range; an
        AssertionError is raised when they fall outside of it.
        '''
        assert 1 <= row <= self.rows, 'invalid row number given'
        assert 1 <= col <= self.cols, 'invalid column number given'
        # Address the cell numerically: passing an "A1" style string to
        # cell() is not accepted by current openpyxl releases.
        return self.loc.cell(row=self.row + row - 1,
                             column=self.min_col + col - 1).value

    def __str__(self):
        ''' printed description of named space
        '''
        if self.has_range:
            locs = 's ' + self.ad_lo + ':' + self.ad_hi
        else:
            locs = ' ' + self.ad_lo
        return ('named range' + str(self.size_of()) + ' in sheet ' + self.sheet
                + ' # location' + locs)

    def __contains__(self, val):
        ''' True when any cell of the named range equals val
        '''
        return any(self.value(row, col) == val
                   for row in range(1, self.rows + 1)
                   for col in range(1, self.cols + 1))

    def vlookup(self, key, col):
        ''' excel style vlookup function

        Searches the first column for key and returns the value found in
        column col of the first matching row, or None when nothing matches.
        '''
        assert col <= self.cols, 'invalid column number given'
        for row in range(1, self.rows + 1):
            if self.value(row, 1) == key:
                return self.value(row, col)
        return None

    def hlookup(self, key, row):
        ''' excel style hlookup function

        Searches the first row for key and returns the value found in row
        row of the first matching column, or None when nothing matches.
        '''
        assert row <= self.rows, 'invalid row number given'
        for col in range(1, self.cols + 1):
            if self.value(1, col) == key:
                return self.value(row, col)
        return None

    @classmethod
    def get_row(cls, ad):
        ''' get row number from cell string
        Cell string is assumed to be in excel format i.e "ABC123" where row is 123
        '''
        row = 0
        for l in ad:
            if l in "1234567890":
                row = row * 10 + int(l)
        return row

    @classmethod
    def col_to_n(cls, ad):
        ''' find column number from xl address
        Cell string is assumed to be in excel format i.e "ABC123" where column is ABC
        column number is the bijective base-26 value of the letters (A=1 ... Z=26, AA=27)
        '''
        n = 0
        for l in ad:
            if l in cls.C_CAPS:
                n = n * 26 + cls.C_CAPS.find(l) + 1
        return n

    @classmethod
    def n_to_col(cls, n):
        ''' make xl column address from column number (inverse of col_to_n)
        '''
        ad = ''
        while n > 0:
            # Column letters have no zero digit, so shift by one before
            # dividing; otherwise multiples of 26 come out wrong (26 -> 'AZ').
            n, rem = divmod(n - 1, 26)
            ad = cls.C_CAPS[rem] + ad
        return ad
def get_names(workbook, filt='', debug=False, log_path="H:\\names.txt"):
    ''' Create a structure containing all of the names in the given workbook

    filt is an optional parameter and used to create a subset of names starting with filt
    useful for IO_ring_spreadsheet as all names start with 'n_'
    if present, filt characters are stripped off the front of the name
    debug, when true, prints every collected name and also writes the list
    to log_path (defaults to the location that used to be hard-coded)
    Returns a dict mapping each (stripped) name to its NamedArray; names whose
    definition starts with #REF are reported and skipped
    '''
    defined = workbook.defined_names
    if hasattr(defined, 'definedName'):
        # list-style container, as used by the openpyxl version this was written for
        named_ranges = defined.definedName
    else:
        # NOTE(review): newer openpyxl exposes defined names as a dict-like
        # object keyed by name - confirm against the installed version
        named_ranges = defined.values()
    name_list = {}
    for named_range in named_ranges:
        name = named_range.name
        if named_range.attr_text.startswith('#REF'):
            print('WARNING: named range "', name, '" is undefined')
        elif filt == '' or name.startswith(filt):
            name_list[name[len(filt):]] = NamedArray(workbook, named_range.attr_text)
    if debug:
        with open(log_path, 'w') as log:
            for item in name_list:
                print (item, '=', name_list[item])
                log.write(item.ljust(30) + ' = ' + str(name_list[item])+'\n')
    return name_list

I agree that the documentation does not really help, and the public API also seems to have only add_table() method.
But then I found an openpyxl Issue 844 asking for a better interface, and it shows that worksheet has an _tables property.
This is enough to get a list of all tables in a file, together with some basic properties:
from openpyxl import load_workbook
# List every table of every worksheet together with a few basic properties.
wb = load_workbook(filename = 'test.xlsx')
for ws in wb.worksheets:
    # ws.tables is the public, dictionary-like accessor (table name -> Table);
    # iterating the container directly would yield names, not Table objects.
    print("Worksheet %s include %d tables:" % (ws.title, len(ws.tables)))
    for tbl in ws.tables.values():
        print(" : " + tbl.displayName)
        print(" - name = " + tbl.name)
        # tableType can be None (standard tables), which cannot be concatenated to a str
        print(" - type = " + (tbl.tableType if isinstance(tbl.tableType, str) else 'n/a'))
        print(" - range = " + tbl.ref)
        print(" - #cols = %d" % len(tbl.tableColumns))
        for col in tbl.tableColumns:
            print(" : " + col.name)
Note that the if/else construct is required for the tableType, since it can return NoneType (for standard tables), which is not convertible to str.

Building on @MichalKaut's answer, I created a simple function that returns a dictionary with all tables in a given workbook. It also puts each table's data into a Pandas DataFrame.
from openpyxl import load_workbook
import pandas as pd
def get_all_tables(filename):
    """ Get all tables from a given workbook. Returns a dictionary of tables.

    Requires a filename, which includes the file path and filename.
    The result is keyed by table name; each entry holds 'table_name',
    'worksheet', 'num_cols', 'table_range' and 'dataframe' (the table data as
    a pandas DataFrame whose column names come from the table's first row).
    Progress information is printed while the workbook is scanned.
    """
    # Load the workbook named by the argument (not a module-level variable),
    # setting read_only to False
    wb = load_workbook(filename=filename, read_only=False, keep_vba=False, data_only=True, keep_links=False)
    # Initialize the dictionary of tables
    tables_dict = {}
    # Go through each worksheet in the workbook
    for ws_name in wb.sheetnames:
        print("")
        print(f"worksheet name: {ws_name}")
        ws = wb[ws_name]
        print(f"tables in worksheet: {len(ws.tables)}")
        # Get each table in the worksheet
        for tbl in ws.tables.values():
            print(f"table name: {tbl.name}")
            # First, add some info about the table to the dictionary
            tables_dict[tbl.name] = {
                'table_name': tbl.name,
                'worksheet': ws_name,
                'num_cols': len(tbl.tableColumns),
                'table_range': tbl.ref}
            # Grab the cell values of the table range, row by row,
            # including the first (header) row
            rows_list = [[cell.value for cell in row] for row in ws[tbl.ref]]
            # Create a pandas dataframe from the rows_list.
            # The first row is the column names
            df = pd.DataFrame(data=rows_list[1:], index=None, columns=rows_list[0])
            # Add the dataframe to the dictionary of tables
            tables_dict[tbl.name]['dataframe'] = df
    return tables_dict
# Path of the workbook to scan (raw string, so the backslashes are kept literally)
file = r"C:\Users\sean\spreadsheets\full_of_tables.xlsx"
# Collect every table of that workbook into a dictionary
tables_dict = get_all_tables(filename=file)

The answer to this has changed.
ws objects now contain the tables accessor which acts as a dictionary. Updated answer is:
# One dictionary-like tables accessor per worksheet of the workbook
tmp = [sheet.tables for sheet in wb.worksheets]
# Flatten them into a list of single-entry {table name: table} dictionaries
tbls = [
    {table.name: table}
    for accessor in tmp
    for table in accessor.values()
]

I'm not sure what you mean by parsing but read-support for worksheet tables has been possible since version 2.4.4. If you have questions about the details then I suggest you ask your question on the openpyxl mailing list as that is a more suitable place for this kind of discussion.

I don't think this is possible. It seems to work similarly to images: if you read and save a file with a table, the table will get stripped.

Related

Show excel data in only one Sheet

I'm having some doubts with the following function. I want it to show me the result in a single excel tab but I can't.
def create_df_from_table(c, tab, excelWriter):
    """Turn one table into a DataFrame and write it to the Excel writer.

    c           -- index of the table; str(c) is used as the sheet name
    tab         -- table object exposing .rows, each row exposing .cells,
                   each cell exposing .text
    excelWriter -- target passed straight to DataFrame.to_excel

    Every row of the table (the first one included) becomes one row of the
    DataFrame, which is printed, written to the writer and returned.
    """
    # Read from the `tab` argument rather than from a loop variable of the
    # calling script, so the function works for any caller.
    table_rows = [
        tuple(each_cell.text for each_cell in each_row.cells)
        for each_row in tab.rows
    ]
    result_df = pd.DataFrame(table_rows)
    print(result_df)
    result_df.to_excel(excelWriter, sheet_name=str(c))
    return result_df
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# is no longer available in current pandas releases.
with pd.ExcelWriter('tablasFromDocx1.xlsx') as excelWriter:
    for c, each_tab in enumerate(file.tables):
        # keep each result reachable as a module-level result_df_<index> variable
        globals()[f'result_df_{c}'] = create_df_from_table(c,each_tab, excelWriter)
The code above (the result_df.to_excel() call on line 14) writes each dataframe to Excel, but every dataframe ends up in a separate tab, and I need all the data in a single tab.

Copy and paste data in excel using python (Keep source formatting)

I want to copy the data from one sheet in a workbook to another sheet in a workbook with source formatting (Cell pattern, cell border, font style, font colour, font size, header and footer).
This is the code i used, it does the work but it failed to keep the source formatting. It would be great if you guys have anything in mind that might be helpful in this topic.
SOURCE CODE:
import openpyxl as xl;
from copy import copy
# Source: the active sheet of the workbook to copy from
wb1 = xl.load_workbook('C:\\Users\\high.xlsx')
ws1 = wb1.active
# Destination: the 'Session' sheet of the workbook that is overwritten below
filename = 'C:\\Users\\3.0.xlsx'
wb2 = xl.load_workbook(filename)
ws2 = wb2['Session']
mr = ws1.max_row
mc = ws1.max_column
mr2 = ws2.max_row
mc2 = ws2.max_column
# Empty the destination sheet before copying
ws2.delete_cols(1, mc2+1)
ws2.delete_rows(1, mr2+1)
for i in range (1, mr + 1):
    for j in range (1, mc + 1):
        c = ws1.cell(row = i, column = j)
        target = ws2.cell(row = i, column = j)
        target.value = c.value
        if c.has_style:
            # Cell style attributes cannot be assigned across workbooks
            # directly; shallow copies made with copy() can.
            target.font = copy(c.font)
            target.border = copy(c.border)
            target.fill = copy(c.fill)
            target.number_format = copy(c.number_format)
            target.protection = copy(c.protection)
            target.alignment = copy(c.alignment)
# NOTE(review): row/column dimensions and header/footer are not copied here
wb2.save(str(filename))
There's a number of attributes on the cell object which you can copy over which have the style information. Trying to copy them directly will fail, but using the copy module will work.
Style information can also be attached to rows and columns - they seem to be attached to the dimension objects on the worksheet.
Update: Oops, looks like this has already been answered - I'll leave this here though since it includes the row and column level styles.
import openpyxl as xl
from copy import copy
# Names of the style attributes copied by the helpers below (used as their default attribute list)
style_attrs = ["alignment", "border", "fill", "font", "number_format", "protection"]
def cells(worksheet):
    """Generate the worksheet's cells one by one, walking it row after row."""
    for cells_in_row in worksheet:
        yield from cells_in_row
def copy_attrs(src, dst, attrs=style_attrs):
    """Set each attribute named in attrs on dst to a shallow copy of src's value.

    Shallow copies are used to avoid
    TypeError: unhashable type: 'StyleProxy'"""
    for attr_name in attrs:
        source_value = getattr(src, attr_name)
        setattr(dst, attr_name, copy(source_value))
def copy_column_attrs(worksheet_src, worksheet_dst, attrs=style_attrs + ["width"]):
    """Transfer ColumnDimension properties between two worksheets.

    For every entry of worksheet_src.column_dimensions, the properties named
    in attrs are copied onto the entry with the same key in
    worksheet_dst.column_dimensions."""
    for column_key, source_dimension in worksheet_src.column_dimensions.items():
        target_dimension = worksheet_dst.column_dimensions[column_key]
        copy_attrs(src=source_dimension, dst=target_dimension, attrs=attrs)
def copy_row_attrs(worksheet_src, worksheet_dst, attrs=style_attrs + ["height"]):
    """Copy RowDimension properties from worksheet_src to worksheet_dst.

    Only properties listed in attrs will be copied; the default is the style
    attributes plus "height"."""
    for row, dimensions in worksheet_src.row_dimensions.items():
        copy_attrs(
            src=dimensions,
            dst=worksheet_dst.row_dimensions[row],
            # honour the caller's list instead of a hard-coded one
            attrs=attrs,
        )
def copy_cells(worksheet_src, worksheet_dst, attrs=style_attrs):
    """Replicate every cell of worksheet_src in worksheet_dst.

    The value is always copied to the cell at the same row/column; when the
    source cell is styled, the attributes listed in attrs are copied first."""
    for source_row in worksheet_src:
        for source_cell in source_row:
            target_cell = worksheet_dst.cell(
                row=source_cell.row, column=source_cell.column
            )
            if source_cell.has_style:
                copy_attrs(source_cell, target_cell, attrs=attrs)
            target_cell.value = source_cell.value
def delete_worksheet_cells(worksheet):
    """Empty the worksheet by deleting its columns, then its rows.

    Each call starts at index 1 and removes max_column + 1 columns
    (respectively max_row + 1 rows)."""
    column_count = worksheet.max_column + 1
    worksheet.delete_cols(1, column_count)
    # read max_row only after the columns are gone, as the order matters to the sheet
    row_count = worksheet.max_row + 1
    worksheet.delete_rows(1, row_count)
# Source workbook: its active sheet is the one copied from
wb_src = xl.load_workbook("a.xlsx")
ws_src = wb_src.active
# Destination workbook: its active sheet is emptied, then filled
wb_dst = xl.load_workbook("b.xlsx")
ws_dst = wb_dst.active
delete_worksheet_cells(ws_dst)
# Column- and row-level properties first, then the cells themselves
copy_column_attrs(ws_src, ws_dst)
copy_row_attrs(ws_src, ws_dst)
copy_cells(ws_src, ws_dst)
# Saved under the same name it was loaded from, i.e. b.xlsx is overwritten
wb_dst.save("b.xlsx")

openpyxl read Table to python

I'm trying to read an excel Table object into python and can't find any syntax for doing so.
It would be useful to read a whole table into e.g. a dict of dicts (I'm trying not to pull in pandas as a dependency for this particular project).
I can't find any way of doing this.
The code below reads through the table row by row. You can also specify the range:
import openpyxl
wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.active
# Slicing the sheet returns the rows of the requested range; iterate over that
# result (the bare expression on its own was discarded and the loop walked the
# whole sheet instead). Each row of A1:B7 holds exactly two cells.
for i1,i2 in sheet['A1':'B7']:
    print("{0:8} {1:8}".format(i1.value,i2.value))
Output:
Student_name Marks
Tony Stark 47
Loki 59
Oddin 73
Nick Fury 62
Samaul 75
Peter Parkar 80
I got it working as follows:
import openpyxl
def all_tables_data(filename: str) -> dict:
    """
    Get values for all tables in a spreadsheet.

    Returns a dict of tables, keyed by table name.
    Table values are given as lists of lists (one inner list per row of the
    table's range, header row included).
    """
    workbook = openpyxl.load_workbook(filename)
    tables_data = {}
    for worksheet in workbook.worksheets:
        # Use the public, dictionary-like accessor: iterating the container
        # itself yields table names rather than Table objects.
        for table in worksheet.tables.values():
            cell_range = worksheet[table.ref]
            tables_data[table.name] = [
                [cell.value for cell in row] for row in cell_range
            ]
    return tables_data
# Example usage when run as a script: print the values of every table of the workbook
if __name__ == "__main__":
    FILENAME = "my_spreadsheet.xlsx"
    TABLES = all_tables_data(FILENAME)
    print(TABLES)
I'm interested in converting tables to dicts.
Often the rows/entries in a table may have >1 key.
Assuming the table has a header row, I also cobbled the code below together.
It can be used to convert a table to a dict of rows, with each row being a dict of values, keyed by the column headers.
class WorkbookWithTables:
    """Workbook wrapper that indexes every worksheet table by its name.

    tables_by_name maps a table name to the table object and
    table_worksheets maps the same name to the worksheet holding it.
    """

    def __init__(self, workbook):
        """Index the tables of every worksheet of an already loaded workbook."""
        self.workbook = workbook
        self.tables_by_name = {}
        self.table_worksheets = {}
        for worksheet in self.workbook.worksheets:
            # Public, dictionary-like accessor: iterating the container
            # itself yields table names rather than Table objects.
            for table in worksheet.tables.values():
                self.tables_by_name[table.name] = table
                self.table_worksheets[table.name] = worksheet

    @classmethod
    def from_file(cls, filename):
        """Alternate constructor: load the workbook from filename first."""
        _workbook = openpyxl.load_workbook(filename)
        return cls(_workbook)

    def table_to_dict(self, table_name, n_keys=1):
        """Convert a table with a header row into a dict of row dicts.

        The first n_keys columns form the row key (a single value when
        n_keys == 1, otherwise a tuple). The remaining columns become a dict
        keyed by their header. Raises KeyError for an unknown table_name.
        """
        worksheet = self.table_worksheets[table_name]
        cell_range = worksheet[self.tables_by_name[table_name].ref]
        table_dict = {}
        # headers of the value (non-key) columns, taken from the first row
        value_headers = tuple(cell.value for cell in cell_range[0][n_keys:])
        for row in cell_range[1:]:
            if n_keys == 1:
                key = row[0].value
            else:
                key = tuple(partial_key.value for partial_key in row[:n_keys])
            values = {
                value_headers[i]: cell.value
                for i, cell in enumerate(row[n_keys:])
            }
            table_dict[key] = values
        return table_dict
# Example usage when run as a script: one table keyed by its first column,
# one keyed by its first two columns
if __name__ == "__main__":
    FILENAME = "my_spreadsheet.xlsx"
    WB = WorkbookWithTables.from_file(FILENAME)
    MY_SINGLE_KEYED_TABLE = WB.table_to_dict("my_single_keyed_table")
    MY_DOUBLE_KEYED_TABLE = WB.table_to_dict("my_double_keyed_table", 2)

How to convert text table to dataframe

I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linked text file and convert it to a CSV file. Right now I am only half successful: I can locate the table and parse it, but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
# Address of the text filing that contains the table
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
# Download the filing and split its text into lines
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
# Text searched for to find where the table starts
table_title = 'PRINCIPAL STOCKHOLDERS'
# Text searched for to find where the table ends
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title,splited_data):
    """Return the indexes of all lines of splited_data that contain table_title."""
    return [
        line_no
        for line_no, text in enumerate(splited_data)
        if table_title in text
    ]
table_start = find_no_line_start_table(table_title,splited_data)
# I need help with locating the table: the search above returns two locations,
# and the correct one has to be chosen manually (here the second match, index 1).
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
    """Return the index of the first line at or after table_start containing '<C>'.

    Returns None when no such line exists."""
    for line_no, text in enumerate(splited_data[table_start:], table_start):
        if '<C>' in text:
            return line_no
    return None
def get_end_table(start_table_data, splited_data ):
    """Return the index of the first line at or after start_table_data that
    contains END_TABLE_LINE, or None when there is none."""
    for line_no, text in enumerate(splited_data[start_table_data:], start_table_data):
        if END_TABLE_LINE in text:
            return line_no
    return None
def row(l):
    """Split one text line of the table into a list of eight column values.

    Lines with fewer than eight whitespace-separated words yield None.
    Otherwise the leading words, up to and including the first word that
    contains ':', are joined (each preceded by a space) into column 0 and
    every following word fills the next column.
    """
    words = l.split()
    number_columns = 8
    if len(words) < number_columns:
        return None
    data_row = [''] * number_columns
    index = 0
    first_column_done = False
    for word in words:
        if first_column_done:
            index += 1
            data_row[index] = word
            continue
        data_row[0] = data_row[0] + ' ' + word
        # a word containing ':' closes the first column
        first_column_done = ':' in word
    return data_row
# First line containing '<C>' after the title, then the first end-marker line after that
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
# Lines of the table itself (the end-marker line is excluded)
table = splited_data[start_line : end_line]
# I also need help with converting the text table to a CSV file; somehow the following function does not recognize my columns.
def take_table(table):
    """Collect the parsed table lines column by column.

    Each line of table is parsed with row(); lines for which it returns
    nothing are skipped. Returns a dict mapping the eight column names to the
    list of values gathered for that column.
    """
    column_names = ('owner', 'Num_share', 'middle', 'middle_1',
                    'middle_2', 'middle_3', 'prior_offering',
                    'after_offering')
    table_data = {column_name: [] for column_name in column_names}
    for line in table:
        parsed = row(line)
        if not parsed:
            continue
        # exactly eight fields are expected per parsed line
        f1, f2, f3, f4, f5, f6, f7, f8 = parsed
        for column_name, field in zip(column_names, (f1, f2, f3, f4, f5, f6, f7, f8)):
            table_data[column_name].append(field)
    return table_data
#print (table)
# Parse the table lines into columns, build a DataFrame from them and write it as CSV
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)

Python - How to create an Excel Calculated Field without modifying original source of Data

I have 2 tables on Excel:
.
I've created an excel Pivot Table using Python but I could not find a simple way to create a calculated field inside it (like I would do with VB) which matches Region from left table and Region from right table.
So I did this, using the module win32com.client:
First, stored the content of the tables in two lists : myTable and myRates.
Then, added a new column to the original left table where I calculated CA * (1 + rate). The code here:
# The first element is the title of the new column
calField = [['CA Bonifié']]
# Pair every table row with every rate row; index 0 of each is its header and is skipped
for a, testMyTable in enumerate(myTable):
    for b, testMyRates in enumerate(myRates):
        if a > 0 and b > 0 and testMyTable[0] == testMyRates[0]:
            # last value of the table row times (1 + rate)
            bonus_value = testMyTable[-1] * (1 + testMyRates[1])
            calField.append([bonus_value])
# Write the computed values one per row into the column right after the table
for i, testDataRow in enumerate(calField):
    for j, testDataItem in enumerate(testDataRow):
        Sheet1.Cells(i + 1, len(testMyTable) + 1).Value = testDataItem
What it does in the sheet "source":
What it does in the created sheet "TCD":
The result is OK, but I don't like this method as it alters the original table. So I'm looking for a simpler method to do that.
Thanks in advance for your help
PS : The whole code below. May it help.
import win32com.client
# Obtain the Excel application object through COM
Excel = win32com.client.gencache.EnsureDispatch('Excel.Application')
# Shorthand for the constants used further down (xlDatabase, xlRowField, ...)
win32c = win32com.client.constants
Excel.Visible = True
# Workbook and sheet holding the source data
wb = Excel.Workbooks.Open('C:/Users/Documents/Python/classeur.xlsx')
Sheet1 = wb.Worksheets('Source')
def getContiguousRange(fichier, sheet, row, col):
    """Return the values of the filled block whose top-left cell is (row, col).

    The block extends downwards along column col and rightwards along row
    row until the next cell is None or ''. fichier is not used.
    """
    blank_values = (None, '')

    last_row = row
    while True:
        if sheet.Cells(last_row + 1, col).Value in blank_values:
            break
        last_row += 1

    last_col = col
    while True:
        if sheet.Cells(row, last_col + 1).Value in blank_values:
            break
        last_col += 1

    top_left = sheet.Cells(row, col)
    bottom_right = sheet.Cells(last_row, last_col)
    return sheet.Range(top_left, bottom_right).Value
# Read the two contiguous tables that start at A1 and at H1 of the source sheet
myTable = getContiguousRange(fichier=wb, sheet=Sheet1, row=1, col=1)
myRates = getContiguousRange(fichier=wb, sheet=Sheet1, row=1, col=8)
# The first element is the title of the new column
calField = [['CA Bonifié']]
# Pair every table row with every rate row; index 0 of each is its header and is skipped
for a, testMyTable in enumerate(myTable):
    for b, testMyRates in enumerate(myRates):
        if a > 0 and b > 0 and testMyTable[0] == testMyRates[0]:
            # last value of the table row times (1 + rate)
            bonus_value = testMyTable[-1] * (1 + testMyRates[1])
            calField.append([bonus_value])
# Write the computed values one per row into the column right after the table
for i, testDataRow in enumerate(calField):
    for j, testDataItem in enumerate(testDataRow):
        Sheet1.Cells(i + 1, len(testMyTable) + 1).Value = testDataItem
# Pivot source: the original table plus the one extra column written above
cl1 = Sheet1.Cells(1,1)
cl2 = Sheet1.Cells(len(myTable),len(myTable[0])+1)
pivotSourceRange = Sheet1.Range(cl1,cl2)
pivotSourceRange.Select()
# New sheet 'TCD', added after the first sheet, receives the pivot table at row 4, column 1
Sheet2 = wb.Sheets.Add (After=wb.Sheets (1))
Sheet2.Name = 'TCD'
cl3=Sheet2.Cells(4,1)
pivotTargetRange= Sheet2.Range(cl3,cl3)
pivotTableName = 'tableauCroisé'
# Build the pivot cache from the source range, then the pivot table from the cache
pivotCache = wb.PivotCaches().Create(SourceType=win32c.xlDatabase, SourceData=pivotSourceRange, Version=win32c.xlPivotTableVersion14)
pivotTable = pivotCache.CreatePivotTable(TableDestination=pivotTargetRange, TableName=pivotTableName, DefaultVersion=win32c.xlPivotTableVersion14)
# 'Service' as row field, 'Region' as page field with 'IDF' selected
pivotTable.PivotFields('Service').Orientation = win32c.xlRowField
pivotTable.PivotFields('Service').Position = 1
pivotTable.PivotFields('Region').Orientation = win32c.xlPageField
pivotTable.PivotFields('Region').Position = 1
pivotTable.PivotFields('Region').CurrentPage = 'IDF'
# Data fields: the original 'CA' column and the computed 'CA Bonifié' column
dataField = pivotTable.AddDataField(pivotTable.PivotFields('CA'))
dataField.NumberFormat = '# ### €'
calculField = pivotTable.AddDataField(pivotTable.PivotFields('CA Bonifié'))
calculField.NumberFormat = '# ### €'
# Saving and closing are left disabled
# wb.SaveCopyAs('C:/Users/Documents/Python/tcd.xlsx')
# wb.Close(True)
# Excel.Application.Quit()
Note: I'm using Sheet1 because the image shows all the relevant indices, which makes it easier to verify.
You can move the formula to the PivotTable at a later step, once verified.
STEP Replace Column E with the Formula =VLOOKUP
Reference: how-to-use-vlookup-match
Replace the following in your Code:
# Put a VLOOKUP formula into column 5 (E), one sheet row per calField entry, starting at row 2
for row, testDataRow in enumerate(calField, 2):
    #Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
    lookup_formula = f'=VLOOKUP(A{row}, H1:I5, MATCH(H1,H1:I1))'
    Sheet1.Cells(row, 5).Formula = lookup_formula
The Result should show the matching Taux!
Come back and confirm Results are OK!
STEP Compute Taux

Categories