I am creating worksheets on a fly and not naming them anything. I am unable to freeze the first column and row. I tired working with naming the sheet when adding it to the workbook and it works. However doesn't work on the fly. Below is the code
base = xlwt.Workbook()
for k,v in MainDict.items():
base.add_sheet(k.upper())
col_width = 256 * 50
xlwt.add_palette_colour("custom_colour", 0x21)
pattern = 'url:(.*)'
search = re.compile(pattern)
base.set_colour_RGB(0x21, 251, 228, 228)
style = xlwt.easyxf('pattern: pattern solid, fore_colour custom_colour;font : bold on;alignment: horiz center;font: name Times New Roman size 20;font:underline single')
index = MainDict.keys().index(k)
ws = base.get_sheet(index)
ws.set_panes_frozen(True)
try:
for i in itertools.count():
ws.col(i).width = col_width
except ValueError:
pass
style1 = xlwt.easyxf('font: name Times New Roman size 15')
style2 = xlwt.easyxf('font : bold on;font: name Times New Roman size 12')
col=0
for sk in MainDict[k].keys():
ws.write(0,col,sk.upper(),style)
col+=1
row =1
for mk in MainDict[k][sk].keys():
for lk,lv in MainDict[k][sk][mk].items():
for items in lv:
text = ('%s URL: %s')%(items,lk)
links =('No data Found. Please visit the URL: %s')% (lk)
url = re.findall(pattern,text)
if len(items) != 0:
if re.match(pattern,text)==True:
ws.write(row,col-1,url,style2)
else:
ws.write(row,col-1,text,style1)
row+=1
else:
ws.write(row,col-1,links,style2)
#ws.Column(col-1,ws).width = 10000
row+=1
default_book_style = base.default_style
default_book_style.font.height = 20 * 36
base.save('project7.xls')
You have to use
ws.set_panes_frozen(True)
ws.set_horz_split_pos(1)
ws.set_vert_split_pos(1)
to make frozen take effect.
The reason this isn't working may be the result of the "get_sheet() function. Instead, store the add_sheet() call to "ws" and use that:
#base.add_sheet(k.upper())
ws = base.add_sheet(k.upper())
And then you need this sequence of attributes to freeze top row:
#ws = base.get_sheet(index)
#ws.set_panes_frozen(True)
ws.set_horz_split_pos(1)
ws.set_vert_split_pos(1)
ws.panes_frozen = True
ws.remove_splits = True
I tested this using your code snippet and it works on my end.
For reference, you can set these attributes either via function or as assignment:
ws.set_panes_frozen(True)
ws.set_remove_splits(True)
Related
I'm currently stuck when it comes to color coding cells with specific values containing =="PPI". I'm using openpyxl to save an excel sheet that will state the color code RedFill if these cells in column B contain the value "PPI". I have a feeling I'm missing something specific in my code.
from openpyxl import load_workbook
from openpyxl.styles import Font, Color, Alignment, Border, Side, PatternFill
filename='DB_1.xlsx'
workbook = load_workbook(filename='DB_1.xlsx')
sheet = workbook.active
bold_font = Font(bold=True)
big_blue_text = Font(color="000066CC", size=11)
center_aligned_text = Alignment(horizontal="center")
redFill = PatternFill(start_color='FFFF0000',end_color='FFFF0000',fill_type='solid')
cell_1= sheet["G1"]
cell_1
cell_1.value = " Notes "
cell_1.value
cell_2 = sheet["A1"]
cell_2
cell_2.value = " Primary Key"
cell_2.value
sheet.auto_filter.ref = "A1:G1"
for c in sheet["A1:G1"][0]:
c.font = bold_font
for a in sheet["A1:G1"][0]:
a.font = big_blue_text
for b in sheet["A1:G1"][0]:
b.alignment = center_aligned_text
for d in sheet["B:B"]:
if d == "PPI":
d.PatternFill = RedFill
More specifically:
for d in sheet["B:B"]:
if d == "PPI":
d.PatternFill = RedFill
Below are the samples data that I need to color code that contains "PPI":
enter image description here
There are few changes required...
The d == "PPI" will return false as it is not searching for a
pattern. You need to search for the substring using something like
if "PPI" in d.value
If true, you need to use d.fill = redFill, not d.PatternFill
If not already doing it, do save the file
I was able to make this work updating the code like this. Hope this helps.
for d in sheet["B:B"]:
if "PPI" in d.value:
d.fill = redFill
workbook.save('new.xlsx') ## Or whatever name you want the updated file to be in
I want to copy the data from one sheet in a workbook to another sheet in a workbook with source formatting (Cell pattern, cell border, font style, font colour, font size, header and footer).
This is the code i used, it does the work but it failed to keep the source formatting. It would be great if you guys have anything in mind that might be helpful in this topic.
SOURCE CODE:
import openpyxl as xl;
from copy import copy
wb1 = xl.load_workbook('C:\\Users\\high.xlsx')
ws1 = wb1.active
filename = 'C:\\Users\\3.0.xlsx'
wb2 = xl.load_workbook(filename)
ws2 = wb2['Session']
mr = ws1.max_row
mc = ws1.max_column
mr2 = ws2.max_row
mc2 = ws2.max_column
ws2.delete_cols(1, mc2+1)
ws2.delete_rows(1, mr2+1)
for i in range (1, mr + 1):
for j in range (1, mc + 1):
c = ws1.cell(row = i, column = j)
ws2.cell(row = i, column = j).value = c.value
wb2.save(str(filename))
There's a number of attributes on the cell object which you can copy over which have the style information. Trying to copy them directly will fail, but using the copy module will work.
Style information can also be attached to rows and columns - they seem to be attached to the dimension objects on the worksheet.
Update: Oops, looks like this has already been answered - I'll leave this here though since it includes the row and column level styles.
import openpyxl as xl
from copy import copy
style_attrs = ["alignment", "border", "fill", "font", "number_format", "protection"]
def cells(worksheet):
"""Return a generator for the sequence of cells in the worksheet"""
for row in worksheet:
for cell in row:
yield cell
def copy_attrs(src, dst, attrs=style_attrs):
"""Copy attributes from src to dst. Attributes are shallow-copied to avoid
TypeError: unhashable type: 'StyleProxy'"""
for name in attrs:
setattr(dst, name, copy(getattr(src, name)))
def copy_column_attrs(worksheet_src, worksheet_dst, attrs=style_attrs + ["width"]):
"""Copy ColumnDimension properties from worksheet_src to worksheet_dst.
Only properties listed in attrs will be copied."""
for column, dimensions in worksheet_src.column_dimensions.items():
copy_attrs(
src=dimensions,
dst=worksheet_dst.column_dimensions[column],
attrs=attrs,
)
def copy_row_attrs(worksheet_src, worksheet_dst, attrs=style_attrs + ["height"]):
"""Copy RowDimension properties from worksheet_src to worksheet_dst.
Only properties listed in attrs will be copied."""
for row, dimensions in worksheet_src.row_dimensions.items():
copy_attrs(
src=dimensions,
dst=worksheet_dst.row_dimensions[row],
attrs=style_attrs + ["height"],
)
def copy_cells(worksheet_src, worksheet_dst, attrs=style_attrs):
"""Copy cells from worksheet_src to worksheet_dst. If cells are styled
then also copy the attributes listed in attrs."""
for cell in cells(worksheet_src):
cell_dst = worksheet_dst.cell(row=cell.row, column=cell.column)
if cell.has_style:
copy_attrs(cell, cell_dst, attrs=attrs)
cell_dst.value = cell.value
def delete_worksheet_cells(worksheet):
worksheet.delete_cols(1, worksheet.max_column + 1)
worksheet.delete_rows(1, worksheet.max_row + 1)
wb_src = xl.load_workbook("a.xlsx")
ws_src = wb_src.active
wb_dst = xl.load_workbook("b.xlsx")
ws_dst = wb_dst.active
delete_worksheet_cells(ws_dst)
copy_column_attrs(ws_src, ws_dst)
copy_row_attrs(ws_src, ws_dst)
copy_cells(ws_src, ws_dst)
wb_dst.save("b.xlsx")
I want to convert an xlsx with Python. I used the modules tablib and xtopdf to build a well structured table. Works excellent! Unfortunately the content does not fit on one pdf page. So I wanted to change the pagesize and format to horizontal A3. But I don't know how that could work. My code:
import random
import tablib
from openpyxl import load_workbook
from xtopdf import PDFWriter
from pyPdf import PdfFileWriter, PdfFileReader
workbook = load_workbook('C:/Users/user1/Testexcel.xlsx', guess_types=True, data_only=True)
worksheet = workbook.get_sheet_by_name('Testsheet')
ws_range = worksheet.iter_rows('A4:H6')
# Helper function to output a string to both screen and PDF.
def print_and_write(pw, strng):
print strng
pw.writeLine(strng)
# Create an empty Dataset and set its headers.
data = tablib.Dataset()
data.headers = ['col1', 'col2', 'col3', 'col4']
widths = [30, 20, 10, 20] # Display widths for columns.
for row in ws_range:
col1 = str(row[0].value)
col2 = str(row[1].value)
col3 = str(row[2].value)
col4 = str(row[3].value)
columns = [col1, col2, col3, col4]
row = [ str(col).center(widths[idx]) for idx, col in enumerate(columns) ]
data.append(row)
# Set up the PDFWriter.
pw = PDFWriter('C:/Users/user1/Test.pdf')
pw.setFont('Courier', 10)
pw.setHeader('Test')
pw.setFooter('Test')
# Generate header and data rows as strings; output them to screen and PDF.
separator = '-' * sum(widths)
print_and_write(pw, separator)
# Output headers
header_strs = [ header.center(widths[idx]) for idx, header in enumerate(data.headers) ]
print_and_write(pw, ''.join(header_strs))
print_and_write(pw, separator)
# Output data
for row in data:
print_and_write(pw, ''.join(row))
print_and_write(pw, separator)
pw.close()
Found out that the PDFWriter from xtopdf itself instanciates an canvas object of the reportlab library. In the canvas class an attribute pagesize is declared which is setted by default to 'A4'. But if I change the entry to 'A3' the result pdf still is in 'A4'.
class Canvas(textobject._PDFColorSetter):
from reportlab.pdfgen import canvas
c = canvas.Canvas("hello.pdf")
from reportlab.lib.units import inch
# move the origin up and to the left
c.translate(inch,inch)
# define a large font
c.setFont("Helvetica", 80)
# choose some colors
c.setStrokeColorRGB(0.2,0.5,0.3)
c.setFillColorRGB(1,0,1)
# draw a rectangle
c.rect(inch,inch,6*inch,9*inch, fill=1)
# make text go straight up
c.rotate(90)
# change color
c.setFillColorRGB(0,0,0.77)
# say hello (note after rotate the y coord needs to be negative!)
c.drawString(3*inch, -3*inch, "Hello World")
c.showPage()
c.save()
"""
def __init__(self,filename,
pagesize='A3',
bottomup = 1,
pageCompression=None,
encoding = None,
invariant = None,
verbosity=0):
"""Create a canvas of a given size. etc.
You may pass a file-like object to filename as an alternative to
a string.
Most of the attributes are private - we will use set/get methods
as the preferred interface. Default page size is A4."""
if pagesize is None: pagesize = 'A3'
if encoding is None: encoding = rl_config.defaultEncoding
if invariant is None: invariant = rl_config.invariant
self._filename = filename
self._encodingName = encoding
self._doc = pdfdoc.PDFDocument(encoding,
compression=pageCompression,
invariant=invariant, filename=filename)
#this only controls whether it prints 'saved ...' - 0 disables
self._verbosity = verbosity
#this is called each time a page is output if non-null
self._onPage = None
self._pagesize = pagesize
self._pageRotation = 0
#self._currentPageHasImages = 0
self._pageTransition = None
self._pageDuration = None
self._destinations = {} # dictionary of destinations for cross indexing.
self.setPageCompression(pageCompression)
self._pageNumber = 1 # keep a count
#self3 = [] #where the current page's marking operators accumulate
# when we create a form we need to save operations not in the form
self._codeStack = []
self._restartAccumulators() # restart all accumulation state (generalized, arw)
self._annotationCount = 0
self._outlines = [] # list for a name tree
self._psCommandsBeforePage = [] #for postscript tray/font commands
self._psCommandsAfterPage = [] #for postscript tray/font commands
#PostScript has the origin at bottom left. It is easy to achieve a top-
#down coord system by translating to the top of the page and setting y
#scale to -1, but then text is inverted. So self.bottomup is used
#to also set the text matrix accordingly. You can now choose your
#drawing coordinates.
self.bottomup = bottomup
self.imageCaching = rl_config.defaultImageCaching
self._make_preamble()
self.init_graphics_state()
self.state_stack = []
edit: I think the changes in the reportlab module are not accepted by the system. Tried to remove the dictionary reportlab and tried to import it then in the commandline. Ironically it works ylthough python should not find that module anymore.
try this
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm
c = canvas.Canvas("hello.pdf", pagesize = (297 * mm, 420 * mm))
# or (420 * mm, 297 * mm) if you want it in portrait format
# values for inch: 11.69 * inch , 16.53 * inch
#the following would create an empty page
c.showPage()
c.save()
Just forked a project named xtopdf at bitbucket and made the following change:
##------------------------ PDFWriter.__init__ ----------------------------
- def __init__(self, pdf_fn):
+ def __init__(self, pdf_fn, pagesize='A4'):
'''
Constructor.
"pdf_fn" arg is the name of the PDF file to be created.
'''
self.__pdf_fn = pdf_fn # file name of PDF file
- self.__canv = canvas.Canvas(pdf_fn) # canvas to write on
+ self.__canv = canvas.Canvas(pdf_fn, pagesize) # canvas to write on
self.__font_name = None # font name
self.__font_size = None # font size
self.__header_str = None # header string (partial)
Can you try it? use pw = PDFWriter('C:/Users/user1/Test.pdf', 'A3').
In the openpyxl documentation there is an example of how to place a table into a workbook but there are no examples of how to find back the tables of a workbook. I have an XLS file that has named tables in it and I want to open the file, find all of the tables and parse them. I cannot find any documentation on how to do this. Can anyone help?
In the meantime I worked it out and wrote the following class to work with openpyxl:
class NamedArray(object):
''' Excel Named range object
Reproduces the named range feature of Microsoft Excel
Assumes a definition in the form <Worksheet PinList!$A$6:$A$52 provided by openpyxl
Written for use with, and initialised by the get_names function
After initialisation named array can be used in the same way as for VBA in excel
Written for openpyxl version 2.4.1, may not work with earlier versions
'''
C_CAPS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
def __init__(self, wb, named_range_raw):
''' Initialise a NameArray object from the named_range_raw information in the given workbook
'''
self.sheet, cellrange_str = str(named_range_raw).split('!')
self.sheet = self.sheet.replace("'",'') # remove the single quotes if they exist
self.loc = wb[self.sheet]
if ':' in cellrange_str:
self.has_range = True
self.has_value = False
lo, hi = cellrange_str.split(':')
self.ad_lo = lo.replace('$','')
self.ad_hi = hi.replace('$','')
else:
self.has_range = False
self.has_value = True
self.ad_lo = cellrange_str.replace('$','')
self.ad_hi = self.ad_lo
self.row = self.get_row(self.ad_lo)
self.max_row = self.get_row(self.ad_hi)
self.rows = self.max_row - self.row + 1
self.min_col = self.col_to_n(self.ad_lo)
self.max_col = self.col_to_n(self.ad_hi)
self.cols = self.max_col - self.min_col + 1
def size_of(self):
''' Returns two dimensional size of named space
'''
return self.cols, self.rows
def value(self, row=1, col=1):
''' Returns the value at row, col
'''
assert row <= self.rows , 'invalid row number given'
assert col <= self.cols , 'invalid column number given'
return self.loc.cell(self.n_to_col(self.min_col + col-1)+str(self.row + row-1)).value
def __str__(self):
''' printed description of named space
'''
locs = 's ' + self.ad_lo + ':' + self.ad_hi if self.is_range else ' ' + self.ad_lo
return('named range'+ str(self.size_of()) + ' in sheet ' + self.sheet + ' # location' + locs)
def __contains__(self, val):
rval = False
for row in range(1,self.rows+1):
for col in range(1,self.cols+1):
if self.value(row,col) == val:
rval = True
return rval
def vlookup(self, key, col):
''' excel style vlookup function
'''
assert col <= self.cols , 'invalid column number given'
rval = None
for row in range(1,self.rows+1):
if self.value(row,1) == key:
rval = self.value(row, col)
break
return rval
def hlookup(self, key, row):
''' excel style hlookup function
'''
assert row <= self.rows , 'invalid row number given'
rval = None
for col in range(1,self.cols+1):
if self.value(1,col) == key:
rval = self.value(row, col)
break
return rval
#classmethod
def get_row(cls, ad):
''' get row number from cell string
Cell string is assumed to be in excel format i.e "ABC123" where row is 123
'''
row = 0
for l in ad:
if l in "1234567890":
row = row*10 + int(l)
return row
#classmethod
def col_to_n(cls, ad):
''' find column number from xl address
Cell string is assumed to be in excel format i.e "ABC123" where column is abc
column number is integer represenation i.e.(A-A)*26*26 + (B-A)*26 + (C-A)
'''
n = 0
for l in ad:
if l in cls.C_CAPS:
n = n*26 + cls.C_CAPS.find(l)+1
return n
#classmethod
def n_to_col(cls, n):
''' make xl column address from column number
'''
ad = ''
while n > 0:
ad = cls.C_CAPS[n%26-1] + ad
n = n // 26
return ad
def get_names(workbook, filt='', debug=False):
''' Create a structure containing all of the names in the given workbook
filt is an optional parameter and used to create a subset of names starting with filt
useful for IO_ring_spreadsheet as all names start with 'n_'
if present, filt characters are stipped off the front of the name
'''
named_ranges = workbook.defined_names.definedName
name_list = {}
for named_range in named_ranges:
name = named_range.name
if named_range.attr_text.startswith('#REF'):
print('WARNING: named range "', name, '" is undefined')
elif filt == '' or name.startswith(filt):
name_list[name[len(filt):]] = NamedArray(workbook, named_range.attr_text)
if debug:
with open("H:\\names.txt",'w') as log:
for item in name_list:
print (item, '=', name_list[item])
log.write(item.ljust(30) + ' = ' + str(name_list[item])+'\n')
return name_list
I agree that the documentation does not really help, and the public API also seems to have only add_table() method.
But then I found an openpyxl Issue 844 asking for a better interface, and it shows that worksheet has an _tables property.
This is enough to get a list of all tables in a file, together with some basic properties:
from openpyxl import load_workbook
wb = load_workbook(filename = 'test.xlsx')
for ws in wb.worksheets:
print("Worksheet %s include %d tables:" % (ws.title, len(ws._tables)))
for tbl in ws._tables:
print(" : " + tbl.displayName)
print(" - name = " + tbl.name)
print(" - type = " + (tbl.tableType if isinstance(tbl.tableType, str) else 'n/a')
print(" - range = " + tbl.ref)
print(" - #cols = %d" % len(tbl.tableColumns))
for col in tbl.tableColumns:
print(" : " + col.name)
Note that the if/else construct is required for the tableType, since it can return NoneType (for standard tables), which is not convertible to str.
Building on #MichalKaut's answer, I created a simple function that returns a dictionary with all tables in a given workbook. It also puts each table's data into a Pandas DataFrame.
from openpyxl import load_workbook
import pandas as pd
def get_all_tables(filename):
""" Get all tables from a given workbook. Returns a dictionary of tables.
Requires a filename, which includes the file path and filename. """
# Load the workbook, from the filename, setting read_only to False
wb = load_workbook(filename=file, read_only=False, keep_vba=False, data_only=True, keep_links=False)
# Initialize the dictionary of tables
tables_dict = {}
# Go through each worksheet in the workbook
for ws_name in wb.sheetnames:
print("")
print(f"worksheet name: {ws_name}")
ws = wb[ws_name]
print(f"tables in worksheet: {len(ws.tables)}")
# Get each table in the worksheet
for tbl in ws.tables.values():
print(f"table name: {tbl.name}")
# First, add some info about the table to the dictionary
tables_dict[tbl.name] = {
'table_name': tbl.name,
'worksheet': ws_name,
'num_cols': len(tbl.tableColumns),
'table_range': tbl.ref}
# Grab the 'data' from the table
data = ws[tbl.ref]
# Now convert the table 'data' to a Pandas DataFrame
# First get a list of all rows, including the first header row
rows_list = []
for row in data:
# Get a list of all columns in each row
cols = []
for col in row:
cols.append(col.value)
rows_list.append(cols)
# Create a pandas dataframe from the rows_list.
# The first row is the column names
df = pd.DataFrame(data=rows_list[1:], index=None, columns=rows_list[0])
# Add the dataframe to the dictionary of tables
tables_dict[tbl.name]['dataframe'] = df
return tables_dict
# File location:
file = r"C:\Users\sean\spreadsheets\full_of_tables.xlsx"
# Run the function to return a dictionary of all tables in the Excel workbook
tables_dict = get_all_tables(filename=file)
The answer to this has changed.
ws objects now contain the tables accessor which acts as a dictionary. Updated answer is:
tmp = [ws.tables for ws in wb.worksheets]
tbls = [{v.name:v} for t in tmp for v in t.values()]
I'm not sure what you mean by parsing but read-support for worksheet tables has been possible since version 2.4.4. If you have questions about the details then I suggest you ask your question on the openpyxl mailing list as that is a more suitable place for this kind of discussion.
I don't think this is possible. I seems to work similarly to images; if you read and save a file with a table it will get striped.
I'm able to use a paragraph object to select font size, color, bold, etc. within a table cell. But, add_paragraph() seems to always insert a leading \n into the cell and this messes up the formatting on some tables.
If I just use the cell.text('') method it doesn't insert this newline but then I can't control the text attributes.
Is there a way to eliminate this leading newline?
Here is my function:
def add_table_cell(table, row, col, text, fontSize=8, r=0, g=0, b=0, width=-1):
cell = table.cell(row,col)
if (width!=-1):
cell.width = Inches(width)
para = cell.add_paragraph(style=None)
para.alignment = WD_ALIGN_PARAGRAPH.LEFT
run = para.add_run(text)
run.bold = False
run.font.size = Pt(fontSize)
run.font.color.type == MSO_COLOR_TYPE.RGB
run.font.color.rgb = RGBColor(r, g, b)
I tried the following and it worked out for me. Not sure if is the best approach:
cells[0].text = 'Some text' #Write the text to the cell
#Modify the paragraph alignment, first paragraph
cells[0].paragraphs[0].paragraph_format.alignment=WD_ALIGN_PARAGRAPH.CENTER
The solution that I find is to use text attribute instead of add_paragraph() but than use add_run():
row_cells[0].text = ''
row_cells[0].paragraphs[0].add_run('Total').bold = True
row_cells[0].paragraphs[0].paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
I've look through the documentation of cell, and it's not the problem of add_paragraph(). The problem is when you having a cell, by default, it will have a paragraph inside it.
class docx.table._Cell:
paragraphs: ... By default, a new cell contains a single paragraph. Read-only
Therefore, if you want to add paragraphs in the first row of cell, you should first delete the default paragraph first. Since python-docx don't have paragraph.delete(), you can use the function mention in this github issue: feature: Paragraph.delete()
def delete_paragraph(paragraph):
p = paragraph._element
p.getparent().remove(p)
p._p = p._element = None
Therefore, you should do something like:
cell = table.cell(0,0)
paragraph = cell.paragraphs[0]
delete_paragraph(paragraph)
paragraph = cell.add_paragraph('text you want to add', style='style you want')
Update at 10/8/2022
Sorry, the above approach is kinda unnecessary.
It's much intuitive to edit the default paragraph instead of first deleting it and add it back.
For the function add_table_cell, just replace the para = cell.paragraphs[0]
and para.style = None, the para.style = None is not necessary as it should be default value for a new paragraph.
Here is what worked for me. I don't call add_paragraph(). I just reference the first paragraph with this call -> para = cell.paragraphs[0]. Everything else after that is the usual api calls.
table = doc.add_table( rows=1, cols=3 ) # bar codes
for tableRow in table.rows:
for cell in tableRow.cells:
para = cell.paragraphs[0]
run = para.add_run( "*" + specIDStr + "*" )
font = run.font
font.name = 'Free 3 of 9'
font.size = Pt( 20 )
run = para.add_run( "\n" + specIDStr
+ "\n" + firstName + " " + lastName
+ "\tDOB: " + dob )
font = run.font
font.name = 'Arial'
font.size = Pt( 8 )