Excel Python using openpyxl - python

I extracted informations from ERP database of my company and with these data I have to add them to a sheet already created. But unfortunately my problem is being in how to use these commands.. The openpyxl documentation is not help me :(
This is an example of my sheet.
import sys
import datetime
from openpyxl import load_workbook
data = datetime.datetime.now()
wb = load_workbook('/home/multipla/Documentos/test.xlsx')
ws = wb.active
for i in ws['B3':'F12']:
for j in i:
print j
#Edit:
Write these cell "Hands On":
ws["B3"].value = '2324'
ws["C3"].value = 'Patty'
ws["D3"].value = ''
ws["E3"].value = "YES"
ws["F3"].value = "Reading"
Tried to automatize that process, by making a For Loop to go through each cell and write them...
#Edit2

If you have a list of dictionaries with your properties, that is:
object_1 = dict()
object_1['ID'] = 1337
object_1['NAME'] = 'Pencil'
...
object_2 = dict()
object_2['ID'] = 1338
object_2['NAME'] = 'Eraser'
...
object_list = list()
object_list.append(object_1)
object_list.append(object_2)
Then you could do something like this:
def add_object(ob, row):
ws.cell(column=2, row=row).value = ob['ID']
ws.cell(column=3, row=row).value = ob['NAME']
ws.cell(column=4, row=row).value = ob['Y']
ws.cell(column=5, row=row).value = ob['X']
ws.cell(column=6, row=row).value = ob['ISSUE']
def add_object_list(ob_list):
for i, ob in enumerate(ob_list):
add_object(ob, i + 3)
add_object_list(object_list)
Result:

Related

Show excel data in only one Sheet

I'm having some doubts with the following function. I want it to show me the result in a single excel tab but I can't.
def create_df_from_table(c,tab, excelWriter):
list_name = str(c)+"_result_list"
list_name = []
for i,each_row in enumerate(each_tab.rows):
text = (each_cell.text for each_cell in each_row.cells)
if i == -1:
keys = tuple(text)
else:
each_dict_val = tuple(text)
list_name.append(each_dict_val)
list_name_copy = list_name.copy()
result_df = pd.DataFrame(list_name)
print(result_df)
result_df.to_excel(excelWriter, sheet_name=str(c))
return result_df
excelWriter = pd.ExcelWriter('tablasFromDocx1.xlsx')
for c, each_tab in enumerate(file.tables):
globals()[f'result_df_{c}'] = create_df_from_table(c,each_tab, excelWriter)
excelWriter.save()
The code above in line 14 (result_df.to_excel() ) passes the dataframe to excel but in more than one tab and I need only all the data in one

openpyxl read Table to python

I'm trying to read an excel Table object into python and can't find any syntax for doing so.
It would be useful to read a whole table into e.g. a dict of dicts (I'm trying not to pull in pandas as a dependency for this particular project).
I can't find any way of doing this.
Below code will read through the table row by row, Also you can specify the range
import openpyxl
wb = openpyxl.load_workbook('example.xlsx')
sheet = wb.active
sheet['A1':'B7']
for i1,i2 in sheet:
print("{0:8} {1:8}".format(i1.value,i2.value))
Output:
Student_name Marks
Tony Stark 47
Loki 59
Oddin 73
Nick Fury 62
Samaul 75
Peter Parkar 80
I got it working as follows:
import openpyxl
def all_tables_data(filename: str) -> dict:
"""
Get values for all tables in a spreadsheet.
Returns a dict of tables, keyed by table name.
Table values are given as lists of lists.
"""
workbook = openpyxl.load_workbook(filename)
tables_by_name = {}
table_worksheets = {}
for worksheet in workbook.worksheets:
for table in worksheet._tables:
tables_by_name[table.name] = table
table_worksheets[table.name] = worksheet
def get_vals(table_name: str) -> list:
worksheet = table_worksheets[table_name]
cell_range = worksheet[tables_by_name[table_name].ref]
return [[cell.value for cell in row] for row in cell_range]
return {table_name: get_vals(table_name) for table_name in tables_by_name}
if __name__ == "__main__":
FILENAME = "my_spreadsheet.xlsx"
TABLES = all_tables_data(FILENAME)
print(TABLES)
I'm interested in converting tables to dicts.
Often the rows/entries in a table may have >1 key.
Assuming the table has a header row, I also cobbled the code below together.
It can be used to convert a table to a dict of rows, with each row being a dict of values, keyed by the column headers.
class WorkbookWithTables:
def __init__(self, workbook):
self.workbook = workbook
self.tables_by_name = {}
self.table_worksheets = {}
for worksheet in self.workbook.worksheets:
for table in worksheet._tables:
self.tables_by_name[table.name] = table
self.table_worksheets[table.name] = worksheet
#classmethod
def from_file(cls, filename):
_workbook = openpyxl.load_workbook(filename)
return cls(_workbook)
def table_to_dict(self, table_name, n_keys=1):
worksheet = self.table_worksheets[table_name]
cell_range = worksheet[self.tables_by_name[table_name].ref]
table_dict = {}
value_headers = tuple(cell.value for cell in cell_range[0][n_keys:])
for row in cell_range[1:]:
if n_keys == 1:
key = row[0].value
else:
key = tuple(partial_key.value for partial_key in row[:n_keys])
values = {
value_headers[i]: cell.value
for i, cell in enumerate(row[n_keys:])
}
table_dict[key] = values
return table_dict
if __name__ == "__main__":
FILENAME = "my_spreadsheet.xlsx"
WB = WorkbookWithTables.from_file(FILENAME)
MY_SINGLE_KEYED_TABLE = WB.table_to_dict("my_single_keyed_table")
MY_DOUBLE_KEYED_TABLE = WB.table_to_dict("my_double_keyed_table", 2)

Filtering text from a Pdf using PyPDF2 to Excel Sheet with nothing Showing

import PyPDF2
import re
import xlsxwriter
docsFile = open('image0001.pdf','rb')
pdfReader = PyPDF2.PdfFileReader(docsFile)
loanNumberlist = []
loan2Matchlist = []
poolNumlist = []
borrowerNamelist = []
wb = xlsxwriter.Workbook('docInfo.xlsx')
ws = wb.add_worksheet('sheet2')
row = 0
columnHeaders = ['Borrower Name', 'Loan Number', 'LD Loan Number', 'Pool #']
for col, colname in enumerate(columnHeaders, start=0):
ws.write(row, col, colname)
class pdfExtract:
def __init__(self, pg):
self.pg = pg
def extractShit(self):
pageObj = pdfReader.getpage(self.pg)
pgData = pageObj.extractText()
loanNumber = re.split('\\bLoan #:\\b', pgData)[-1]
loanNumberlist.append(loanNumber)
loan2Match = re.match(r"?:/\d{0,10}", pgData)[-1]
loan2Matchlist.append(loan2Match)
poolNumber = re.split('\\bPool #:\\b',pgData)[-1]
poolNumlist.append(poolNumber)
borrowerName =re.split('\\bBorrower #:\\b',pgData)[-1]
borrowerNamelist.append(borrowerName)
for page in range(0, 223):
pdfExtract(page)
for row, rowvar in enumerate(borrowerNamelist, start=1):#write Borrower name
col = 0
ws.write_string(row, col, rowvar)
for row, lnNM in enumerate(loanNumberlist, start=1):#write loan number 1
col = 1
ws.write_number(row, col, lnNM)
for row, lnNM2 in enumerate(loan2Matchlist, start=1):#write loan number 2
col = 2
ws.write_number(row, col, lnNM2)
for row, plNm in enumerate(poolNumlist, start=1):#write pool number
col = 3
ws.write_number(row, col, plNm)
wb.close()
So, I wrote this program to grab data from a pdf file and return 4 things in each page and put them into an excel file. That looks like:
Loan #: 0065192080/3000009289
Pool#: AK1576
Borrower: David h Theman
I have to grab each page get the first loan number, then the second loan number(after the “/“). Then the rest.
It runs through, but all I get to see on the excel file is the headers no data and no errors.
I thought it was how I'm returning the data or how I'm writing it, but it has no changes. Would it have to do with my For loops? The regex code I got from different answers on here. I've changed how I write it to the excel file, but no luck.

openpyxl read tables from existing data book example?

In the openpyxl documentation there is an example of how to place a table into a workbook but there are no examples of how to find back the tables of a workbook. I have an XLS file that has named tables in it and I want to open the file, find all of the tables and parse them. I cannot find any documentation on how to do this. Can anyone help?
In the meantime I worked it out and wrote the following class to work with openpyxl:
class NamedArray(object):
''' Excel Named range object
Reproduces the named range feature of Microsoft Excel
Assumes a definition in the form <Worksheet PinList!$A$6:$A$52 provided by openpyxl
Written for use with, and initialised by the get_names function
After initialisation named array can be used in the same way as for VBA in excel
Written for openpyxl version 2.4.1, may not work with earlier versions
'''
C_CAPS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
def __init__(self, wb, named_range_raw):
''' Initialise a NameArray object from the named_range_raw information in the given workbook
'''
self.sheet, cellrange_str = str(named_range_raw).split('!')
self.sheet = self.sheet.replace("'",'') # remove the single quotes if they exist
self.loc = wb[self.sheet]
if ':' in cellrange_str:
self.has_range = True
self.has_value = False
lo, hi = cellrange_str.split(':')
self.ad_lo = lo.replace('$','')
self.ad_hi = hi.replace('$','')
else:
self.has_range = False
self.has_value = True
self.ad_lo = cellrange_str.replace('$','')
self.ad_hi = self.ad_lo
self.row = self.get_row(self.ad_lo)
self.max_row = self.get_row(self.ad_hi)
self.rows = self.max_row - self.row + 1
self.min_col = self.col_to_n(self.ad_lo)
self.max_col = self.col_to_n(self.ad_hi)
self.cols = self.max_col - self.min_col + 1
def size_of(self):
''' Returns two dimensional size of named space
'''
return self.cols, self.rows
def value(self, row=1, col=1):
''' Returns the value at row, col
'''
assert row <= self.rows , 'invalid row number given'
assert col <= self.cols , 'invalid column number given'
return self.loc.cell(self.n_to_col(self.min_col + col-1)+str(self.row + row-1)).value
def __str__(self):
''' printed description of named space
'''
locs = 's ' + self.ad_lo + ':' + self.ad_hi if self.is_range else ' ' + self.ad_lo
return('named range'+ str(self.size_of()) + ' in sheet ' + self.sheet + ' # location' + locs)
def __contains__(self, val):
rval = False
for row in range(1,self.rows+1):
for col in range(1,self.cols+1):
if self.value(row,col) == val:
rval = True
return rval
def vlookup(self, key, col):
''' excel style vlookup function
'''
assert col <= self.cols , 'invalid column number given'
rval = None
for row in range(1,self.rows+1):
if self.value(row,1) == key:
rval = self.value(row, col)
break
return rval
def hlookup(self, key, row):
''' excel style hlookup function
'''
assert row <= self.rows , 'invalid row number given'
rval = None
for col in range(1,self.cols+1):
if self.value(1,col) == key:
rval = self.value(row, col)
break
return rval
#classmethod
def get_row(cls, ad):
''' get row number from cell string
Cell string is assumed to be in excel format i.e "ABC123" where row is 123
'''
row = 0
for l in ad:
if l in "1234567890":
row = row*10 + int(l)
return row
#classmethod
def col_to_n(cls, ad):
''' find column number from xl address
Cell string is assumed to be in excel format i.e "ABC123" where column is abc
column number is integer represenation i.e.(A-A)*26*26 + (B-A)*26 + (C-A)
'''
n = 0
for l in ad:
if l in cls.C_CAPS:
n = n*26 + cls.C_CAPS.find(l)+1
return n
#classmethod
def n_to_col(cls, n):
''' make xl column address from column number
'''
ad = ''
while n > 0:
ad = cls.C_CAPS[n%26-1] + ad
n = n // 26
return ad
def get_names(workbook, filt='', debug=False):
''' Create a structure containing all of the names in the given workbook
filt is an optional parameter and used to create a subset of names starting with filt
useful for IO_ring_spreadsheet as all names start with 'n_'
if present, filt characters are stipped off the front of the name
'''
named_ranges = workbook.defined_names.definedName
name_list = {}
for named_range in named_ranges:
name = named_range.name
if named_range.attr_text.startswith('#REF'):
print('WARNING: named range "', name, '" is undefined')
elif filt == '' or name.startswith(filt):
name_list[name[len(filt):]] = NamedArray(workbook, named_range.attr_text)
if debug:
with open("H:\\names.txt",'w') as log:
for item in name_list:
print (item, '=', name_list[item])
log.write(item.ljust(30) + ' = ' + str(name_list[item])+'\n')
return name_list
I agree that the documentation does not really help, and the public API also seems to have only add_table() method.
But then I found an openpyxl Issue 844 asking for a better interface, and it shows that worksheet has an _tables property.
This is enough to get a list of all tables in a file, together with some basic properties:
from openpyxl import load_workbook
wb = load_workbook(filename = 'test.xlsx')
for ws in wb.worksheets:
print("Worksheet %s include %d tables:" % (ws.title, len(ws._tables)))
for tbl in ws._tables:
print(" : " + tbl.displayName)
print(" - name = " + tbl.name)
print(" - type = " + (tbl.tableType if isinstance(tbl.tableType, str) else 'n/a')
print(" - range = " + tbl.ref)
print(" - #cols = %d" % len(tbl.tableColumns))
for col in tbl.tableColumns:
print(" : " + col.name)
Note that the if/else construct is required for the tableType, since it can return NoneType (for standard tables), which is not convertible to str.
Building on #MichalKaut's answer, I created a simple function that returns a dictionary with all tables in a given workbook. It also puts each table's data into a Pandas DataFrame.
from openpyxl import load_workbook
import pandas as pd
def get_all_tables(filename):
""" Get all tables from a given workbook. Returns a dictionary of tables.
Requires a filename, which includes the file path and filename. """
# Load the workbook, from the filename, setting read_only to False
wb = load_workbook(filename=file, read_only=False, keep_vba=False, data_only=True, keep_links=False)
# Initialize the dictionary of tables
tables_dict = {}
# Go through each worksheet in the workbook
for ws_name in wb.sheetnames:
print("")
print(f"worksheet name: {ws_name}")
ws = wb[ws_name]
print(f"tables in worksheet: {len(ws.tables)}")
# Get each table in the worksheet
for tbl in ws.tables.values():
print(f"table name: {tbl.name}")
# First, add some info about the table to the dictionary
tables_dict[tbl.name] = {
'table_name': tbl.name,
'worksheet': ws_name,
'num_cols': len(tbl.tableColumns),
'table_range': tbl.ref}
# Grab the 'data' from the table
data = ws[tbl.ref]
# Now convert the table 'data' to a Pandas DataFrame
# First get a list of all rows, including the first header row
rows_list = []
for row in data:
# Get a list of all columns in each row
cols = []
for col in row:
cols.append(col.value)
rows_list.append(cols)
# Create a pandas dataframe from the rows_list.
# The first row is the column names
df = pd.DataFrame(data=rows_list[1:], index=None, columns=rows_list[0])
# Add the dataframe to the dictionary of tables
tables_dict[tbl.name]['dataframe'] = df
return tables_dict
# File location:
file = r"C:\Users\sean\spreadsheets\full_of_tables.xlsx"
# Run the function to return a dictionary of all tables in the Excel workbook
tables_dict = get_all_tables(filename=file)
The answer to this has changed.
ws objects now contain the tables accessor which acts as a dictionary. Updated answer is:
tmp = [ws.tables for ws in wb.worksheets]
tbls = [{v.name:v} for t in tmp for v in t.values()]
I'm not sure what you mean by parsing but read-support for worksheet tables has been possible since version 2.4.4. If you have questions about the details then I suggest you ask your question on the openpyxl mailing list as that is a more suitable place for this kind of discussion.
I don't think this is possible. I seems to work similarly to images; if you read and save a file with a table it will get striped.

Creating pivot table in Excel using python causes pywintypes.com_error

I adapted the following code found here to create a pivot table in my existing excel sheet:
import win32com.client as win32
win32c = win32.constants
import sys
import itertools
tablecount = itertools.count(1)
def addpivot(wb,sourcedata,title,filters=(),columns=(),
rows=(),sumvalue=(),sortfield=""):
newsheet = wb.Sheets.Add()
newsheet.Cells(1,1).Value = title
newsheet.Cells(1,1).Font.Size = 16
tname = "PivotTable%d"%tablecount.next()
pc = wb.PivotCaches().Add(SourceType=win32c.xlDatabase,
SourceData=sourcedata)
pt = pc.CreatePivotTable(TableDestination="%s!R4C1"%newsheet.Name,
TableName=tname,
DefaultVersion=win32c.xlPivotTableVersion10)
for fieldlist,fieldc in ((filters,win32c.xlPageField),
(columns,win32c.xlColumnField),
(rows,win32c.xlRowField)):
for i,val in enumerate(fieldlist):
wb.ActiveSheet.PivotTables(tname).PivotFields(val).Orientation = fieldc
wb.ActiveSheet.PivotTables(tname).PivotFields(val).Position = i+1
wb.ActiveSheet.PivotTables(tname).AddDataField(wb.ActiveSheet.PivotTables(tname).
PivotFields(sumvalue),sumvalue,win32c.xlSum)
def runexcel():
excel = win32.gencache.EnsureDispatch('Excel.Application')
#excel.Visible = True
try:
wb = excel.Workbooks.Open('18.03.14.xls')
except:
print "Failed to open spreadsheet 18.03.14.xls"
sys.exit(1)
ws = wb.Sheets('defaulters')
xldata = ws.UsedRange.Value
newdata = []
for row in xldata:
if len(row) == 4 and row[-1] is not None:
newdata.append(list(row))
rowcnt = len(newdata)
colcnt = len(newdata[0])
wsnew = wb.Sheets.Add()
wsnew.Range(wsnew.Cells(1,1),wsnew.Cells(rowcnt,colcnt)).Value = newdata
wsnew.Columns.AutoFit()
src = "%s!R1C1:R%dC%d"%(wsnew.Name,rowcnt,colcnt)
addpivot(wb,src,
title="Employees by leads",
filters=("Leads",),
columns=(),
rows=("Name",),
sumvalue="Actual hours",
sortfield=())
if int(float(excel.Version)) >= 12:
wb.SaveAs('new18.03.14.xlsx',win32c.xlOpenXMLWorkbook)
else:
wb.SaveAs('new18.03.14.xls')
excel.Application.Quit()
if __name__ == "__main__":
runexcel()
This line of code,
wb.ActiveSheet.PivotTables(tname).AddDataField(wb.ActiveSheet.PivotTables(tname).PivotFields(sumvalue),sumvalue,win32c.xlSum)
returns the following error:
pywintypes.com_error: (-2147352567, 'Exception occurred.', (0, u'Microsoft Excel', u'PivotFields method of PivotTable class failed', u'xlmain11.chm', 0, -2146827284), None).
When I remove that line, the pivot table is generated without any data fields. Is there something I'm doing wrong?
As this is the one of the first Google hits when searching for Excel pivot tables from Python, I post my example code. This code generates a simple pivot table in Excel through a COM server, with some basic filters, columns, rows, and some number formatting applied.
I hope this helps someone not to waste half a day on it (like I did...)
import win32com.client
Excel = win32com.client.gencache.EnsureDispatch('Excel.Application') # Excel = win32com.client.Dispatch('Excel.Application')
win32c = win32com.client.constants
wb = Excel.Workbooks.Add()
Sheet1 = wb.Worksheets("Sheet1")
TestData = [['Country','Name','Gender','Sign','Amount'],
['CH','Max' ,'M','Plus',123.4567],
['CH','Max' ,'M','Minus',-23.4567],
['CH','Max' ,'M','Plus',12.2314],
['CH','Max' ,'M','Minus',-2.2314],
['CH','Sam' ,'M','Plus',453.7685],
['CH','Sam' ,'M','Minus',-53.7685],
['CH','Sara','F','Plus',777.666],
['CH','Sara','F','Minus',-77.666],
['DE','Hans','M','Plus',345.088],
['DE','Hans','M','Minus',-45.088],
['DE','Paul','M','Plus',222.455],
['DE','Paul','M','Minus',-22.455]]
for i, TestDataRow in enumerate(TestData):
for j, TestDataItem in enumerate(TestDataRow):
Sheet1.Cells(i+2,j+4).Value = TestDataItem
cl1 = Sheet1.Cells(2,4)
cl2 = Sheet1.Cells(2+len(TestData)-1,4+len(TestData[0])-1)
PivotSourceRange = Sheet1.Range(cl1,cl2)
PivotSourceRange.Select()
Sheet2 = wb.Worksheets(2)
cl3=Sheet2.Cells(4,1)
PivotTargetRange= Sheet2.Range(cl3,cl3)
PivotTableName = 'ReportPivotTable'
PivotCache = wb.PivotCaches().Create(SourceType=win32c.xlDatabase, SourceData=PivotSourceRange, Version=win32c.xlPivotTableVersion14)
PivotTable = PivotCache.CreatePivotTable(TableDestination=PivotTargetRange, TableName=PivotTableName, DefaultVersion=win32c.xlPivotTableVersion14)
PivotTable.PivotFields('Name').Orientation = win32c.xlRowField
PivotTable.PivotFields('Name').Position = 1
PivotTable.PivotFields('Gender').Orientation = win32c.xlPageField
PivotTable.PivotFields('Gender').Position = 1
PivotTable.PivotFields('Gender').CurrentPage = 'M'
PivotTable.PivotFields('Country').Orientation = win32c.xlColumnField
PivotTable.PivotFields('Country').Position = 1
PivotTable.PivotFields('Country').Subtotals = [False, False, False, False, False, False, False, False, False, False, False, False]
PivotTable.PivotFields('Sign').Orientation = win32c.xlColumnField
PivotTable.PivotFields('Sign').Position = 2
DataField = PivotTable.AddDataField(PivotTable.PivotFields('Amount'))
DataField.NumberFormat = '#\'##0.00'
Excel.Visible = 1
wb.SaveAs('ranges_and_offsets.xlsx')
Excel.Application.Quit()
Found from PivotTable.AddDataField method (Excel) that in expression .AddDataField(Field, Caption, Function) only Field is required and the other two parameters are optional. I removed them and the code works fine!

Categories