Python pptx module - python

I have to edit a pptx template which contain tables like this:
How to append values stored in my python dict to the empty fields?
I'm using a pptx module, but I couldn't find any example of doing this.
from pptx import Presentation
prs = Presentation('template.pptx')
slide = prs.slides[2] #<- This is the slide that contains the table
shape = slide.shapes #<- As I understood This gives access to the shapes
textframe=shape.textframe
textframe.clear()
prs.save('test.pptx') #<- Saves the new file
pptx module link

quote from the dev-group of python-ppty developer
-if you know its index, something like table = slide.shapes[2] would do the trick.
Then you'll need to navigate the cells before you can change their contents:
for idx, row in enumerate(table.rows):
if idx = 0: # skip header row
continue
name_cell = row.cells[0]
name_cell.text = 'foobar'
corners_cell = row.cells[1]

---in main----
table_data = [['ID', 'Name', 'Age', 'Second name'], ['1', 'Petro', 22, 'Petrovich'], ['2', 'Ivan', 32, 'Ivanovich'], ['3', 'Oles', 23, 'Marko']]
prs = Presentation(template_filepath)
slide_1 = slide_build(prs, 5)
table_draw(table_data, slide_1.shapes)
prs.save(result_filepath)
def slide_build(prs, layout):
slide = prs.slides.add_slide(prs.slide_layouts[layout])
return slide
def table_draw(table_data, shapes):
rows_number = 0
columns_number = 0
# get table size
rows_number = len(table_data)
for i, item in enumerate(table_data):
columns_number += 1
table = table_build(rows_number, columns_number, shapes)
column_coord = 0
row_coord = 0
for row_count, row in enumerate(table_data):
for item_count, row_item in enumerate(row):
table.cell(row_count + row_coord, item_count + column_coord).text = str(row_item)
def table_build(rows, cols, shapes):
left = (0.1)
top = Inches(0.7)
width = Inches(6.0)
height = Inches(0.8)
table = shapes.add_table(rows, cols, left, top, width, height).table
# set column widths
i = 0
while i
Some thing like this

Related

How to convert a CSV table into COCO format in python?

I have a CSV table with the following columns:
column_names = ['image_id', 'xmin', 'ymin', 'width', 'height', 'xmax','ymax']
where xmin, ymin, xmax and ymax represent the bounding box that encloses some object; width and height, the image dimensions; and image_id, the file name (.JPG file). Since I want to do object detection, I need to convert this table into COCO format. Amazingly enough, I can't find any answer to this inquiry in the internet.
I had the same issue before, then I found this code it is very helpful
you will need to change the column names to this columns and update the csv file
column_names =['filename','class','width', 'height','xmin','ymin','xmax','ymax']
then try this code
import numpy as np
import json
import pandas as pd
path = 'annotations.csv' # the path to the CSV file
save_json_path = 'traincoco.json'
data = pd.read_csv(path)
images = []
categories = []
annotations = []
category = {}
category["supercategory"] = 'none'
category["id"] = 0
category["name"] = 'None'
categories.append(category)
data['fileid'] = data['filename'].astype('category').cat.codes
data['categoryid']= pd.Categorical(data['class'],ordered= True).codes
data['categoryid'] = data['categoryid']+1
data['annid'] = data.index
def image(row):
image = {}
image["height"] = row.height
image["width"] = row.width
image["id"] = row.fileid
image["file_name"] = row.filename
return image
def category(row):
category = {}
category["supercategory"] = 'None'
category["id"] = row.categoryid
category["name"] = row[2]
return category
def annotation(row):
annotation = {}
area = (row.xmax -row.xmin)*(row.ymax - row.ymin)
annotation["segmentation"] = []
annotation["iscrowd"] = 0
annotation["area"] = area
annotation["image_id"] = row.fileid
annotation["bbox"] = [row.xmin, row.ymin, row.xmax -row.xmin,row.ymax-row.ymin ]
annotation["category_id"] = row.categoryid
annotation["id"] = row.annid
return annotation
for row in data.itertuples():
annotations.append(annotation(row))
imagedf = data.drop_duplicates(subset=['fileid']).sort_values(by='fileid')
for row in imagedf.itertuples():
images.append(image(row))
catdf = data.drop_duplicates(subset=['categoryid']).sort_values(by='categoryid')
for row in catdf.itertuples():
categories.append(category(row))
data_coco = {}
data_coco["images"] = images
data_coco["categories"] = categories
data_coco["annotations"] = annotations
json.dump(data_coco, open(save_json_path, "w"), indent=4)

How to convert text table to dataframe

I am trying to scrape the "PRINCIPAL STOCKHOLDERS" table from the linktext fileand convert it to a csv file. Right now I am only half successful. Namely, I can locate the table and parse it but somehow I cannot convert the text table to a standard one. My code is attached. Can someone help me with it?
url = r'https://www.sec.gov/Archives/edgar/data/1034239/0000950124-97-003372.txt'
# Different approach, the first approach does not work
filing_url = requests.get(url)
content = filing_url.text
splited_data = content.split('\n')
table_title = 'PRINCIPAL STOCKHOLDERS'
END_TABLE_LINE = '- ------------------------'
def find_no_line_start_table(table_title,splited_data):
found_no_lines = []
for index, line in enumerate(splited_data):
if table_title in line:
found_no_lines.append(index)
return found_no_lines
table_start = find_no_line_start_table(table_title,splited_data)
# I need help with locating the table. If I locate the table use the above function, it will return two locations and I have to manually choose the correct one.
table_start = table_start[1]
def get_start_data_table(table_start, splited_data):
for index, row in enumerate(splited_data[table_start:]):
if '<C>' in row:
return table_start + index
def get_end_table(start_table_data, splited_data ):
for index, row in enumerate(splited_data[start_table_data:]):
if END_TABLE_LINE in row:
return start_table_data + index
def row(l):
l = l.split()
number_columns = 8
if len(l) >= number_columns:
data_row = [''] * number_columns
first_column_done = False
index = 0
for w in l:
if not first_column_done:
data_row[0] = ' '.join([data_row[0], w])
if ':' in w:
first_column_done = True
else:
index += 1
data_row[index] = w
return data_row
start_line = get_start_data_table(table_start, splited_data)
end_line = get_end_table(start_line, splited_data)
table = splited_data[start_line : end_line]
# I also need help with convert the text table to a CSV file, somehow the following function does not #recognize my column.
def take_table(table):
owner = []
Num_share = []
middle = []
middle_1 = []
middle_2 = []
middle_3 = []
prior_offering = []
after_offering = []
for r in table:
data_row = row(r)
if data_row:
col_1, col_2, col_3, col_4, col_5, col_6, col_7, col_8 = data_row
owner.append(col_1)
Num_share.append(col_2)
middle.append(col_3)
middle_1.append(col_4)
middle_2.append(col_5)
middle_3.append(col_6)
prior_offering.append(col_7)
after_offering.append(col_8)
table_data = {'owner': owner, 'Num_share': Num_share, 'middle': middle, 'middle_1': middle_1,
'middle_2': middle_2, 'middle_3': middle_3, 'prior_offering': prior_offering,
'after_offering': after_offering}
return table_data
#print (table)
dict_table = take_table(table)
a = pd.DataFrame(dict_table)
a.to_csv('trail.csv')
I think what you need to do is
pd.DataFrame.from_dict(dict_table)
instead of
pd.DataFrame(dict_table)

Python - How to create an Excel Calculated Field without modifying original source of Data

I have 2 tables on Excel:
.
I've created an excel Pivot Table using Python but I could not find a simple way to create a calculated field inside it (like I would do with VB) which matches Region from left table and Region from right table.
So I did this, using the module win32com.client:
First, stored the content of the tables in two lists : myTable and myRates.
Then, added a new column to the original left table where I calculated CA * (1 + rate). The code here:
calField = [['CA Bonifié']] #first element as a title for the new column :
for a, testMyTable in enumerate(myTable):
for b, testMyRates in enumerate(myRates):
if a >0 and b > 0:
if testMyTable[0] == testMyRates[0]:
calField.append( [ testMyTable[ len(testMyTable)-1 ] * ( 1+testMyRates[1] ) ] )
for i, testDataRow in enumerate(calField):
for j, testDataItem in enumerate(testDataRow):
Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
What it does in the sheet "source":
What it does in the created sheet "TCD":
Result is ok but I don't like this method as it alterates the original table. So I'm looking a simplest method to do that.
Thanks in advance for your help
PS : The whole code below. May it help.
import win32com.client
Excel = win32com.client.gencache.EnsureDispatch('Excel.Application')
win32c = win32com.client.constants
Excel.Visible = True
wb = Excel.Workbooks.Open('C:/Users/Documents/Python/classeur.xlsx')
Sheet1 = wb.Worksheets('Source')
def getContiguousRange(fichier, sheet, row, col):
bottom = row
while sheet.Cells(bottom + 1, col).Value not in [None, '']:
bottom = bottom + 1
right = col
while sheet.Cells(row, right + 1).Value not in [None, '']:
right = right + 1
return sheet.Range(sheet.Cells(row, col), sheet.Cells(bottom, right)).Value
myTable = getContiguousRange(fichier = wb, sheet = Sheet1, row = 1, col = 1)
myRates = getContiguousRange(fichier = wb, sheet = Sheet1, row = 1, col = 8)
calField = [['CA Bonifié']]
for a, testMyTable in enumerate(myTable):
for b, testMyRates in enumerate(myRates):
if a >0 and b > 0:
if testMyTable[0] == testMyRates[0]:
calField.append( [ testMyTable[ len(testMyTable)-1 ] * ( 1+testMyRates[1] ) ] )
for i, testDataRow in enumerate(calField):
for j, testDataItem in enumerate(testDataRow):
Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
cl1 = Sheet1.Cells(1,1)
cl2 = Sheet1.Cells(len(myTable),len(myTable[0])+1)
pivotSourceRange = Sheet1.Range(cl1,cl2)
pivotSourceRange.Select()
Sheet2 = wb.Sheets.Add (After=wb.Sheets (1))
Sheet2.Name = 'TCD'
cl3=Sheet2.Cells(4,1)
pivotTargetRange= Sheet2.Range(cl3,cl3)
pivotTableName = 'tableauCroisé'
pivotCache = wb.PivotCaches().Create(SourceType=win32c.xlDatabase, SourceData=pivotSourceRange, Version=win32c.xlPivotTableVersion14)
pivotTable = pivotCache.CreatePivotTable(TableDestination=pivotTargetRange, TableName=pivotTableName, DefaultVersion=win32c.xlPivotTableVersion14)
pivotTable.PivotFields('Service').Orientation = win32c.xlRowField
pivotTable.PivotFields('Service').Position = 1
pivotTable.PivotFields('Region').Orientation = win32c.xlPageField
pivotTable.PivotFields('Region').Position = 1
pivotTable.PivotFields('Region').CurrentPage = 'IDF'
dataField = pivotTable.AddDataField(pivotTable.PivotFields('CA'))
dataField.NumberFormat = '# ### €'
calculField = pivotTable.AddDataField(pivotTable.PivotFields('CA Bonifié'))
calculField.NumberFormat = '# ### €'
# wb.SaveCopyAs('C:/Users/Documents/Python/tcd.xlsx')
# wb.Close(True)
# Excel.Application.Quit()
Note: I'm using Sheet1 as the Image show all relevant indices and its easier to verify.
You can move the Formula to the PivotTabel at a later Step, once verified.
STEP Replace Column E with the Formula =VLOOKUP
Reference: how-to-use-vlookup-match
Replace the following in your Code:
for row, testDataRow in enumerate(calField, 2):
#Sheet1.Cells(i+1,len(testMyTable)+1).Value = testDataItem
Sheet1.Cells(row, 5).Formula = '=VLOOKUP(A{}, H1:I5, MATCH(H1,H1:I1))'.format(row)
The Result should show the matching Taux!
Come back and confirm Results are OK!
STEP Compute Taux

(Python) Change pagesize and format of PDF file generated with xtopdf

I want to convert an xlsx with Python. I used the modules tablib and xtopdf to build a well structured table. Works excellent! Unfortunately the content does not fit on one pdf page. So I wanted to change the pagesize and format to horizontal A3. But I don't know how that could work. My code:
import random
import tablib
from openpyxl import load_workbook
from xtopdf import PDFWriter
from pyPdf import PdfFileWriter, PdfFileReader
workbook = load_workbook('C:/Users/user1/Testexcel.xlsx', guess_types=True, data_only=True)
worksheet = workbook.get_sheet_by_name('Testsheet')
ws_range = worksheet.iter_rows('A4:H6')
# Helper function to output a string to both screen and PDF.
def print_and_write(pw, strng):
print strng
pw.writeLine(strng)
# Create an empty Dataset and set its headers.
data = tablib.Dataset()
data.headers = ['col1', 'col2', 'col3', 'col4']
widths = [30, 20, 10, 20] # Display widths for columns.
for row in ws_range:
col1 = str(row[0].value)
col2 = str(row[1].value)
col3 = str(row[2].value)
col4 = str(row[3].value)
columns = [col1, col2, col3, col4]
row = [ str(col).center(widths[idx]) for idx, col in enumerate(columns) ]
data.append(row)
# Set up the PDFWriter.
pw = PDFWriter('C:/Users/user1/Test.pdf')
pw.setFont('Courier', 10)
pw.setHeader('Test')
pw.setFooter('Test')
# Generate header and data rows as strings; output them to screen and PDF.
separator = '-' * sum(widths)
print_and_write(pw, separator)
# Output headers
header_strs = [ header.center(widths[idx]) for idx, header in enumerate(data.headers) ]
print_and_write(pw, ''.join(header_strs))
print_and_write(pw, separator)
# Output data
for row in data:
print_and_write(pw, ''.join(row))
print_and_write(pw, separator)
pw.close()
Found out that the PDFWriter from xtopdf itself instanciates an canvas object of the reportlab library. In the canvas class an attribute pagesize is declared which is setted by default to 'A4'. But if I change the entry to 'A3' the result pdf still is in 'A4'.
class Canvas(textobject._PDFColorSetter):
from reportlab.pdfgen import canvas
c = canvas.Canvas("hello.pdf")
from reportlab.lib.units import inch
# move the origin up and to the left
c.translate(inch,inch)
# define a large font
c.setFont("Helvetica", 80)
# choose some colors
c.setStrokeColorRGB(0.2,0.5,0.3)
c.setFillColorRGB(1,0,1)
# draw a rectangle
c.rect(inch,inch,6*inch,9*inch, fill=1)
# make text go straight up
c.rotate(90)
# change color
c.setFillColorRGB(0,0,0.77)
# say hello (note after rotate the y coord needs to be negative!)
c.drawString(3*inch, -3*inch, "Hello World")
c.showPage()
c.save()
"""
def __init__(self,filename,
pagesize='A3',
bottomup = 1,
pageCompression=None,
encoding = None,
invariant = None,
verbosity=0):
"""Create a canvas of a given size. etc.
You may pass a file-like object to filename as an alternative to
a string.
Most of the attributes are private - we will use set/get methods
as the preferred interface. Default page size is A4."""
if pagesize is None: pagesize = 'A3'
if encoding is None: encoding = rl_config.defaultEncoding
if invariant is None: invariant = rl_config.invariant
self._filename = filename
self._encodingName = encoding
self._doc = pdfdoc.PDFDocument(encoding,
compression=pageCompression,
invariant=invariant, filename=filename)
#this only controls whether it prints 'saved ...' - 0 disables
self._verbosity = verbosity
#this is called each time a page is output if non-null
self._onPage = None
self._pagesize = pagesize
self._pageRotation = 0
#self._currentPageHasImages = 0
self._pageTransition = None
self._pageDuration = None
self._destinations = {} # dictionary of destinations for cross indexing.
self.setPageCompression(pageCompression)
self._pageNumber = 1 # keep a count
#self3 = [] #where the current page's marking operators accumulate
# when we create a form we need to save operations not in the form
self._codeStack = []
self._restartAccumulators() # restart all accumulation state (generalized, arw)
self._annotationCount = 0
self._outlines = [] # list for a name tree
self._psCommandsBeforePage = [] #for postscript tray/font commands
self._psCommandsAfterPage = [] #for postscript tray/font commands
#PostScript has the origin at bottom left. It is easy to achieve a top-
#down coord system by translating to the top of the page and setting y
#scale to -1, but then text is inverted. So self.bottomup is used
#to also set the text matrix accordingly. You can now choose your
#drawing coordinates.
self.bottomup = bottomup
self.imageCaching = rl_config.defaultImageCaching
self._make_preamble()
self.init_graphics_state()
self.state_stack = []
edit: I think the changes in the reportlab module are not accepted by the system. Tried to remove the dictionary reportlab and tried to import it then in the commandline. Ironically it works ylthough python should not find that module anymore.
try this
from reportlab.pdfgen import canvas
from reportlab.lib.units import mm
c = canvas.Canvas("hello.pdf", pagesize = (297 * mm, 420 * mm))
# or (420 * mm, 297 * mm) if you want it in portrait format
# values for inch: 11.69 * inch , 16.53 * inch
#the following would create an empty page
c.showPage()
c.save()
Just forked a project named xtopdf at bitbucket and made the following change:
##------------------------ PDFWriter.__init__ ----------------------------
- def __init__(self, pdf_fn):
+ def __init__(self, pdf_fn, pagesize='A4'):
'''
Constructor.
"pdf_fn" arg is the name of the PDF file to be created.
'''
self.__pdf_fn = pdf_fn # file name of PDF file
- self.__canv = canvas.Canvas(pdf_fn) # canvas to write on
+ self.__canv = canvas.Canvas(pdf_fn, pagesize) # canvas to write on
self.__font_name = None # font name
self.__font_size = None # font size
self.__header_str = None # header string (partial)
Can you try it? use pw = PDFWriter('C:/Users/user1/Test.pdf', 'A3').

Python-PPTX: Changing table style or adding borders to cells

I've started putting together some code to take Pandas data and put it into a PowerPoint slide. The template I'm using defaults to Medium Style 2 - Accent 1 which would be fine as changing the font and background are fairly easy, but there doesn't appear to be an implemented portion to python-pptx that allows for changing cell borders. Below is my code, open to any solution. (Altering the XML or changing the template default to populate a better style would be good options for me, but haven't found good documentation on how to do either). Medium Style 4 would be ideal for me as it has exactly the borders I'm looking for.
import pandas
import numpy
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
#Template Location
tmplLoc = 'C:/Desktop/'
#Read in Template
prs = Presentation(tmplLoc+'Template.pptx')
#Import data as Pandas Dataframe - dummy data for now
df = pandas.DataFrame(numpy.random.randn(10,10),columns=list('ABCDEFGHIJ'))
#Determine Table Header
header = list(df.columns.values)
#Determine rows and columns
in_rows = df.shape[0]
in_cols = df.shape[1]
#Insert table from C1 template
slide_layout = prs.slide_layouts[11]
slide = prs.slides.add_slide(slide_layout)
#Set slide title
title_placeholder = slide.shapes.title
title_placeholder.text = "Slide Title"
#Augment placeholder to be a table
placeholder = slide.placeholders[1]
graphic_frame = placeholder.insert_table(rows = in_rows+1, cols = in_cols)
table = graphic_frame.table
#table.apply_style = 'MediumStyle4'
#table.apply_style = 'D7AC3CCA-C797-4891-BE02-D94E43425B78'
#Set column widths
table.columns[0].width = Inches(2.23)
table.columns[1].width = Inches(0.9)
table.columns[2].width = Inches(0.6)
table.columns[3].width = Inches(2)
table.columns[4].width = Inches(0.6)
table.columns[5].width = Inches(0.6)
table.columns[6].width = Inches(0.6)
table.columns[7].width = Inches(0.6)
table.columns[8].width = Inches(0.6)
table.columns[9].width = Inches(0.6)
#total_width = 2.23+0.9+0.6+2+0.6*6
#Insert data into table
for rows in xrange(in_rows+1):
for cols in xrange(in_cols):
#Write column titles
if rows == 0:
table.cell(rows, cols).text = header[cols]
table.cell(rows, cols).text_frame.paragraphs[0].font.size=Pt(14)
table.cell(rows, cols).text_frame.paragraphs[0].font.color.rgb = RGBColor(255, 255, 255)
table.cell(rows, cols).fill.solid()
table.cell(rows, cols).fill.fore_color.rgb=RGBColor(0, 58, 111)
#Write rest of table entries
else:
table.cell(rows, cols).text = str("{0:.2f}".format(df.iloc[rows-1,cols]))
table.cell(rows, cols).text_frame.paragraphs[0].font.size=Pt(10)
table.cell(rows, cols).text_frame.paragraphs[0].font.color.rgb = RGBColor(0, 0, 0)
table.cell(rows, cols).fill.solid()
table.cell(rows, cols).fill.fore_color.rgb=RGBColor(255, 255, 255)
#Write Table to File
prs.save('C:/Desktop/test.pptx')
Maybe not really clean code but allowed me to adjust all borders of all cells in a table:
from pptx.oxml.xmlchemy import OxmlElement
def SubElement(parent, tagname, **kwargs):
element = OxmlElement(tagname)
element.attrib.update(kwargs)
parent.append(element)
return element
def _set_cell_border(cell, border_color="000000", border_width='12700'):
tc = cell._tc
tcPr = tc.get_or_add_tcPr()
for lines in ['a:lnL','a:lnR','a:lnT','a:lnB']:
ln = SubElement(tcPr, lines, w=border_width, cap='flat', cmpd='sng', algn='ctr')
solidFill = SubElement(ln, 'a:solidFill')
srgbClr = SubElement(solidFill, 'a:srgbClr', val=border_color)
prstDash = SubElement(ln, 'a:prstDash', val='solid')
round_ = SubElement(ln, 'a:round')
headEnd = SubElement(ln, 'a:headEnd', type='none', w='med', len='med')
tailEnd = SubElement(ln, 'a:tailEnd', type='none', w='med', len='med')
Based on this post: https://groups.google.com/forum/#!topic/python-pptx/UTkdemIZICw
In case someone else comes across this issue again, some changes should be made to the solution posted by JuuLes87 to avoid that Microsoft Office PowerPoint requires to repair the generated presentation.
After carefully inspecting the xml string of the table generated by pptx, I found that the requirement to repair the presentation seemed to be due to the duplicated nodes of 'a:lnL' or 'a:lnR' or 'a:lnT' or 'a:lnB' in the children elements of 'a:tcPr'. So we only need to remove nodes of ['a:lnL','a:lnR','a:lnT','a:lnB'] before these nodes are inserted as below.
from pptx.oxml.xmlchemy import OxmlElement
def SubElement(parent, tagname, **kwargs):
element = OxmlElement(tagname)
element.attrib.update(kwargs)
parent.append(element)
return element
def _set_cell_border(cell, border_color="000000", border_width='12700'):
tc = cell._tc
tcPr = tc.get_or_add_tcPr()
for lines in ['a:lnL','a:lnR','a:lnT','a:lnB']:
# Every time before a node is inserted, the nodes with the same tag should be removed.
tag = lines.split(":")[-1]
for e in tcPr.getchildren():
if tag in str(e.tag):
tcPr.remove(e)
# end
ln = SubElement(tcPr, lines, w=border_width, cap='flat', cmpd='sng', algn='ctr')
solidFill = SubElement(ln, 'a:solidFill')
srgbClr = SubElement(solidFill, 'a:srgbClr', val=border_color)
prstDash = SubElement(ln, 'a:prstDash', val='solid')
round_ = SubElement(ln, 'a:round')
headEnd = SubElement(ln, 'a:headEnd', type='none', w='med', len='med')
tailEnd = SubElement(ln, 'a:tailEnd', type='none', w='med', len='med')
I had a hard time figuring out why this wasn't working. For anyone else struggling with this, I had to add the following to the end of the function:
return cell
When using, you want to use the function as such:
cell = _set_cell_border(cell)

Categories