What is an efficient way to generate PDF for data frames in Pandas?
First plot table with matplotlib then generate pdf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# Demo data: 10 rows of random floats in three named columns.
df = pd.DataFrame(np.random.random((10, 3)), columns=("col 1", "col 2", "col 3"))

# Draw nothing but a table: hide the axes and let the table sit in the centre.
# https://stackoverflow.com/questions/32137396/how-do-i-plot-only-a-table-in-matplotlib
fig, ax = plt.subplots(figsize=(12, 4))
ax.axis('tight')
ax.axis('off')
the_table = ax.table(cellText=df.values, colLabels=df.columns, loc='center')

# bbox_inches='tight' trims the empty margins around the table.
# https://stackoverflow.com/questions/4042192/reduce-left-and-right-margins-in-matplotlib-plot
# The with-statement closes (and thereby finalizes) foo.pdf even when
# savefig raises, instead of relying on a manual pp.close().
with PdfPages("foo.pdf") as pp:
    pp.savefig(fig, bbox_inches='tight')
reference:
How do I plot only a table in Matplotlib?
Reduce left and right margins in matplotlib plot
Here is how I do it from sqlite database using sqlite3, pandas and pdfkit
import pandas as pd
import pdfkit as pdf
import sqlite3
# Read the whole "dobit" table into a DataFrame; the connection is only
# needed for the query, so close it even if the query fails.
con = sqlite3.connect("baza.db")
try:
    df = pd.read_sql_query("select * from dobit", con)
finally:
    con.close()

# Dump the frame to an intermediate HTML file, then let pdfkit convert that
# file to the final PDF.  The HTML path is named once so both steps agree.
html_path = '/home/linux/izvestaj.html'
df.to_html(html_path)
nazivFajla = '/home/linux/pdfPrintOut.pdf'  # output PDF file name
pdf.from_file(html_path, nazivFajla)
Well one way is to use markdown. You can use df.to_html(). This converts the dataframe into a html table. From there you can put the generated html into a markdown file (.md) (see http://daringfireball.net/projects/markdown/basics). From there, there are utilities to convert markdown into a pdf (https://www.npmjs.com/package/markdown-pdf).
One all-in-one tool for this method is to use Atom text editor (https://atom.io/). There you can use an extension, search "markdown to pdf", which will make the conversion for you.
Note: When using to_html() recently I had to remove extra '\n' characters for some reason. I chose to use Atom -> Find -> '\n' -> Replace "".
Overall this should do the trick!
With reference to these two examples that I found useful:
Apply CSS class to Pandas DataFrame using to_html
https://pbpython.com/pdf-reports.html
The simple CSS code saved in same folder as ipynb:
/* includes alternating gray and white with on-hover color */
.mystyle {
    font-size: 11pt;
    font-family: Arial;
    border-collapse: collapse;
    border: 1px solid silver;
}
/* Both selectors are scoped to .mystyle; a bare `th` would style every
   header cell in the document, not just those of this table. */
.mystyle td, .mystyle th {
    padding: 5px;
}
.mystyle tr:nth-child(even) {
    background: #E0E0E0;
}
.mystyle tr:hover {
    background: silver;
    cursor: pointer;
}
The python code:
# NOTE(review): `os`, `folder`, `file_pdf` and `HTML` are not defined in this
# snippet and must already be in scope (HTML is presumably weasyprint.HTML --
# confirm against the surrounding notebook).
pdf_filepath = os.path.join(folder, file_pdf)
demo_df = pd.DataFrame(np.random.random((10, 3)), columns=("col 1", "col 2", "col 3"))
# classes='mystyle' puts that CSS class on the generated <table> so the rules
# in df_style.css apply to it.
table = demo_df.to_html(classes='mystyle')
# The stylesheet <link> belongs inside <head>; placing it between </head> and
# <body> is invalid HTML.
html_string = f'''
<html>
<head><title>HTML Pandas Dataframe with CSS</title>
<link rel="stylesheet" type="text/css" href="df_style.css"/>
</head>
<body>
{table}
</body>
</html>
'''
# The stylesheet is also handed to write_pdf explicitly.
HTML(string=html_string).write_pdf(pdf_filepath, stylesheets=["df_style.css"])
This is a solution with an intermediate HTML file.
The table is pretty printed with some minimal css.
The pdf conversion is done with weasyprint. You need to pip install weasyprint.
# Create a pandas dataframe with demo data (the iris CSV is read from its URL):
import pandas as pd
demodata_csv = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
df = pd.read_csv(demodata_csv)
# Pretty print the dataframe as an html table to a file.
# NOTE(review): to_html_pretty is defined further down in this listing; its
# definition has to be executed before this call, otherwise the call raises
# NameError.
intermediate_html = '/tmp/intermediate.html'
to_html_pretty(df,intermediate_html,'Iris Data')
# if you do not want pretty printing, just use pandas:
# df.to_html(intermediate_html)
# Convert the html file to a pdf file using weasyprint
import weasyprint
out_pdf= '/tmp/demo.pdf'
weasyprint.HTML(intermediate_html).write_pdf(out_pdf)
# This is the table pretty printer used above:
def to_html_pretty(df, filename='/tmp/out.html', title=''):
    """Write an entire dataframe to an HTML file with nice formatting.

    The page is HTML_TEMPLATE1, an optional ``<h2>`` heading built from
    *title* (omitted when *title* is the empty string), the table produced by
    ``df.to_html(classes='wide', escape=False)``, and HTML_TEMPLATE2.
    Because ``escape=False`` is used, cell contents are written as raw HTML.

    Thanks to @stackoverflowuser2010 for the pretty printer, see
    https://stackoverflow.com/a/47723330/362951
    """
    heading = '' if title == '' else '<h2> %s </h2>\n' % title
    page_body = heading + df.to_html(classes='wide', escape=False)
    with open(filename, 'w') as out_file:
        out_file.write(HTML_TEMPLATE1 + page_body + HTML_TEMPLATE2)
# Opening part of the page that to_html_pretty writes: the <head> with inline
# CSS (centered heading and table, bordered cells, row hover highlight, and
# the `.wide` class applied to the table) up to and including the opening
# <body> tag.
HTML_TEMPLATE1 = '''
<html>
<head>
<style>
h2 {
text-align: center;
font-family: Helvetica, Arial, sans-serif;
}
table {
margin-left: auto;
margin-right: auto;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 5px;
text-align: center;
font-family: Helvetica, Arial, sans-serif;
font-size: 90%;
}
table tbody tr:hover {
background-color: #dddddd;
}
.wide {
width: 90%;
}
</style>
</head>
<body>
'''
# Closing part of the page: closes the <body> and <html> elements opened by
# HTML_TEMPLATE1.
HTML_TEMPLATE2 = '''
</body>
</html>
'''
Thanks to @stackoverflowuser2010 for the pretty printer, see stackoverflowuser2010's answer https://stackoverflow.com/a/47723330/362951
I did not use pdfkit, because I had some problems with it on a headless machine. But weasyprint is great.
When using Matplotlib, here's how to get a prettier table with alternating colors for the rows, etc., as well as how to optionally paginate the PDF:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
def _draw_as_table(df, pagesize):
    """Render *df* as a matplotlib table on a blank figure and return the figure.

    Row-label and column-label cells are light blue; body rows alternate
    between white and light gray, starting with white.  *pagesize* is passed
    to ``plt.subplots`` as ``figsize``.
    """
    n_rows, n_cols = len(df), len(df.columns)
    stripes = ('white', 'lightgray')
    # One colour list per body row, cycling through the two stripe colours.
    body_colours = [[stripes[r % 2]] * n_cols for r in range(n_rows)]
    header_colour = 'lightblue'

    fig, ax = plt.subplots(figsize=pagesize)
    for mode in ('tight', 'off'):
        ax.axis(mode)
    ax.table(
        cellText=df.values,
        rowLabels=df.index,
        colLabels=df.columns,
        rowColours=[header_colour] * n_rows,
        colColours=[header_colour] * n_cols,
        cellColours=body_colours,
        loc='center',
    )
    return fig
def dataframe_to_pdf(df, filename, numpages=(1, 1), pagesize=(11, 8.5)):
    """Write *df* to a PDF file as one table per page.

    Parameters
    ----------
    df : the DataFrame to write.
    filename : path of the PDF to create.
    numpages : ``(nh, nv)`` -- the rows are split into ``nh`` consecutive
        chunks and the columns into ``nv`` consecutive chunks; every
        non-empty row-chunk/column-chunk combination becomes one page.
    pagesize : ``(width, height)`` handed to ``_draw_as_table`` (which uses
        it as the figure size).

    Raises
    ------
    ZeroDivisionError
        If ``nh`` or ``nv`` is 0.
    """
    with PdfPages(filename) as pdf:
        nh, nv = numpages
        # Ceiling division: with floor division the trailing rows/columns
        # were silently dropped whenever the size was not an exact multiple
        # of the page count (e.g. 10 rows on 3 pages lost the last row).
        rows_per_page = -(-len(df) // nh)
        cols_per_page = -(-len(df.columns) // nv)
        for i in range(nh):
            for j in range(nv):
                # iloc slices clamp at the frame's bounds, so no min() needed.
                page = df.iloc[i * rows_per_page:(i + 1) * rows_per_page,
                               j * cols_per_page:(j + 1) * cols_per_page]
                if page.empty:
                    # More pages requested than there is data for.
                    continue
                fig = _draw_as_table(page, pagesize)
                if nh > 1 or nv > 1:
                    # Add a part/page number at bottom-center of the page.
                    # fig.text takes figure fractions, so a fixed vertical
                    # offset must be divided by the height (pagesize[1]).
                    fig.text(0.5, 0.5 / pagesize[1],
                             "Part-{}x{}: Page-{}".format(i + 1, j + 1, i * nv + j + 1),
                             ha='center', fontsize=8)
                pdf.savefig(fig, bbox_inches='tight')
                # Close exactly the figure just saved so figures do not pile up.
                plt.close(fig)
Use it as follows:
dataframe_to_pdf(df, 'test_1.pdf')
dataframe_to_pdf(df, 'test_6.pdf', numpages=(3, 2))
Explanation of the code is here:
https://levelup.gitconnected.com/how-to-write-a-pandas-dataframe-as-a-pdf-5cdf7d525488
Related
Getting a red cross while trying to embed a png file created through pyplot in email
The code I am using is here below. What can I add in the current html to get rid of this
I need to add the image in the email body
# Line chart of query['cnt'] against query['dt'], saved as a PNG for the email.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(query['dt'], query['cnt'],
        color='MediumSlateBlue', linewidth=3,
        marker='o', markerfacecolor='DeepSkyBlue')
ax.set_xlabel('D A T E')
ax.set_ylabel('C O U N T')
# Rotate the x tick labels so they are drawn vertically.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.savefig("C:/Users/quratulain.zulfiqar/Desktop/f.png", bbox_inches="tight")
# HTML body of the mail.  The <img> must point at the embedded image through
# a cid: URL that matches the Content-ID header of the image MIME part
# ('<0>', i.e. "cid:0").  A path on the sender's disk means nothing to the
# recipient's mail client and renders as a broken image.
body = """<html>
<head>
<style>
#customers {
font-family: "proxima-nova", sans-serif;
border-collapse: collapse;
width: 100%;
}
#customers td, #customers th {
border: 1px solid #CB3D57;
padding: 8px;
}
#customers tr:nth-child(even){background-color: #CB3D57;}
#customers tr:hover {background-color: #CB3D57;}
#customers th {
padding-top: 12px;
padding-bottom: 10px;
text-align: center;
background-color: #CB3D57;
color: red;
}
</style>
</head>
<body>
<span style="text-align:center;"> <u><h3>"""+"AGENT MAPPING COUNT"+"""</h3></u> </span>
<div><img src= "cid:0" ></img></div>
<p><p>
<div style='vertical-align:middle; display:att;'>
<p><p>
<p><p>
<p>Regards,<p>
</body>
</html>
"""
# In[288]:
# multipart/related is the container for an HTML part together with the
# resources it references via cid: URLs.  With multipart/alternative the
# parts are treated as interchangeable renderings of the same content, so
# the image competes with the HTML instead of being embedded in it.
msg1 = MIMEMultipart('related')
msg1['Subject'] = email_subject
msg1["From"] = from_label
msg1["To"] = to_label
msg1["CC"] = CC

# Regular file attachments; the list is empty here, so this loop is a no-op.
file_list = []
for fl in file_list:
    with open(fl, "rb") as fil:
        part = MIMEApplication(fil.read(), Name=basename(fl))
    part['Content-Disposition'] = 'attachment; filename="%s"' % basename(fl)
    msg1.attach(part)

part1 = MIMEText(body, 'html')
msg1.attach(part1)

with open('C:/Users/quratulain.zulfiqar/Desktop/f.png', 'rb') as f:
    # set attachment mime and file name, the image type is png
    mime = MIMEBase('image', 'png', filename='f.png')
    # 'inline' marks the part as content to display inside the body rather
    # than as a separate downloadable attachment.
    mime.add_header('Content-Disposition', 'inline', filename='f.png')
    mime.add_header('X-Attachment-Id', '0')
    # The HTML body refers to this part as "cid:<id>" (id without brackets).
    mime.add_header('Content-ID', '<0>')
    mime.set_payload(f.read())
encode_base64(mime)
msg1.attach(mime)

# The with-statement ends the SMTP session even when sendmail raises.
with smtplib.SMTP(email_server, email_port) as server:
    server.sendmail(from_email, to_emails, msg1.as_string())
I know there are other ways, but since this is the format we are already following, I need to stick to it and update it here.
The "cid:" string that you send in the email has to exactly match the Content-ID: header in the image attachment (well, the Content-ID gets angle brackets). That's how the email app makes the connection. You are specifying a path on YOUR computer for the cid, which is going to mean absolutely nothing when it is rendered on someone else's computer.
embedding image in html email
Changing the img src to the attached image name worked, as @tim mentioned:
<div><img src= "cid:f.png"></img></div>
I have a script in Python which connects to SQL using pyodbc and returns a set of values from a calendar for the 30 days following today. I prototyped it by using the print('') function to generate the HTML for the file I was creating then copying and pasting it in to an HTML file with Notepad++ and I know the HTML is sound and will be good for its purpose. However when it comes to generating the file I'm running aground with including the SQL results in the variable that is passed to the file writer.
I have tried both {variable} and %v methods which just seem to be either erroring out with;
unsupported format character ';' (0x3b) at index 1744
in the case of %, or in the case of {inset} is just including the word rather than the var. below is the code I have in JN;
from os import getenv
import pyodbc
# Raw string: the backslash in MYSERVER\SQLEXPRESS must stay a literal
# backslash ("\S" is an invalid escape sequence in a normal string literal).
cnxn = pyodbc.connect(r'DRIVER={ODBC Driver 13 for SQL Server};SERVER=MYSERVER\SQLEXPRESS;DATABASE=MyTable;UID=test;PWD=t')
f = open('tes.html','w')
cursor = cnxn.cursor()
# T-SQL local variables are prefixed with "@"; "#" names temporary tables and
# is a syntax error in DECLARE.
cursor.execute('DECLARE @today as date SET @today = GetDate() SELECT style112, day, month, year, dayofweek, showroom_name, isbusy from ShowroomCal where Date Between @today and dateadd(month,1,@today) ')
# Build ONE string holding a <div> per returned row.  Previously `inset` was
# a tuple that was overwritten on every iteration, so at most the last row
# survived and it could not be inserted into the page as text.
# The page template still has to be combined with it explicitly, e.g.
# html_str.replace('{inset}', inset).
inset = '\n'.join(
    '<div class="{}">{}</div>'.format(row.isbusy, row.day)
    for row in cursor.fetchall()
)
html_str = """
<html lang="en" ><head><meta charset="UTF-8"><title>Calendar</title>
<link rel=\'stylesheet prefetch\' href=\'https://netdna.bootstrapcdn.com/font-awesome/3.2.1/css/font-awesome.css\'>
<style>
body{background-color: #ffffff;}
a{color:#462955; text-decoration: none; display: block;}a:hover{color:#ffffff; text-decoration: none; display: block;}#yes a {color:#ffffff !important; text-decoration: none; display: block;}#yes a:hover {color:#ffffff !important; text-decoration: none; display: block;}
#calendar{margin-left: auto;margin-right: auto;width: 800px;font-family: \'Lato\', sans-serif;}
#calendar_weekdays div{display:inline-block;vertical-align:top;}
#calendar_content, #calendar_weekdays, #calendar_header{position: relative;width: 800px;overflow: hidden;float: left;z-index: 10;}
#calendar_weekdays div, #calendar_content div{width: 25px;height: 25px;overflow: hidden;text-align: center;background-color: #FFFFFF;color: #787878;}
.Yes{background-color: #990000 !important;color: #CDCDCD !important;}
.None{background-color: #ffffff !Important;color: #462955 !important;}
.None:hover{background-color: #462955 !Important;color: #ffffff !important;}
.wend{background-color: #676767 !important;color: #999999 !important;}
#calendar_content{background-colour: #ff0000;-webkit-border-radius: 0px 0px 12px 12px;-moz-border-radius: 0px 0px 12px 12px; border-radius: 0px 0px 12px 12px;}
#calendar_content div{float: left;}
#yes {background-color: #ff0000 !important;}
#calendar_content div:hover{background-color: #F8F8F8;}
#calendar_content div.blank{background-color: #E8E8E8;}
#calendar_header, #calendar_content div.today{zoom: 1;filter: alpha(opacity=70);opacity: 0.7;}
#calendar_content div.today{color: #FFFFFF;}
#calendar_header{width: 100%;height: 25px;text-align: center;background-color: #FF6860;padding: 8px 0;-webkit-border-radius: 12px 12px 0px 0px;-moz-border-radius: 12px 12px 0px 0px; border-radius: 12px 12px 0px 0px;}
#calendar_header h1{font-size: 1.5em;color: #FFFFFF;float:left;width:70%;
i[class^=icon-chevron]{color: #FFFFFF;float: left;width:15%;border-radius: 50%;}
</style>
<link href=\'https://fonts.googleapis.com/css?family=Lato\' rel=\'stylesheet\' type=\'text/css\'>
</head><base target="_parent">
<div id="calendar"><div id="calendar_header"><h1>07 2018</h1></div><div id="calendar_weekdays"></div><div id="calendar_content">
{inset}
</div></div><script src=\'jquery.min.js\'></script>
<script>
$(function(){function c(){p();var e=h();var r=0;var u=false;l.empty();while(!u){if(s[r]==e[0].weekday){u=true}else{l.append(\'<div class="blank"></div>\');r++}}for(var c=0;c<42-r;c++){if(c>=e.length){l.append(\'<div class="blank"></div>\')}else{var v=e[c].day;var m=g(new Date(t,n-1,v))?\'<div class="today">\':"<div>";l.append(m+""+v+"</div>")}}var y=o[n-1];a.css("background-color",y).find("h1").text(i[n-1]+" "+t);f.find("div").css("color",y);l.find(".today").css("background-color",y);d()}function h(){var e=[];for(var r=1;r<v(t,n)+1;r++){e.push({day:r,weekday:s[m(t,n,r)]})}return e}function p(){f.empty();for(var e=0;e<7;e++){f.append("<div>"+s[e].substring(0,3)+"</div>")}}function d(){var t;var n=$("#calendar").css("width",e+"px");n.find(t="#calendar_weekdays, #calendar_content").css("width",e+"px").find("div").css({width:e/7+"px",height:e/14+"px","line-height":e/14+"px"});n.find("#calendar_header").css({height:e*(1/14)+"px"}).find(\'i[class^="icon-chevron"]\').css("line-height",e*(1/14)+"px")}function v(e,t){return(new Date(e,t,0)).getDate()}function m(e,t,n){return(new Date(e,t-1,n)).getDay()}function g(e){return y(new Date)==y(e)}function y(e){return e.getFullYear()+"/"+(e.getMonth()+1)+"/"+e.getDate()}function b(){var e=new Date;t=e.getFullYear();n=e.getMonth()+1}var e=700;var t=2018;var n=9;var r=[];var i=["JANUARY","FEBRUARY","MARCH","APRIL","MAY","JUNE","JULY","AUGUST","SEPTEMBER","OCTOBER","NOVEMBER","DECEMBER"];var s=["Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"];var o=["#462955","#462955","#462955","#462955","#462955","#462955","#462955","#462955","#462955","#462955","#462955","#462955"];var u=$("#calendar");var a=u.find("#calendar_header");var f=u.find("#calendarweekdays");var l=u.find("#calendarcontent");b();c();a.find(\'i[class^="icon-chevron"]\').on("click",function(){var e=$(this);var r=function(e){n=e=="next"?n+1:n-1;if(n<1){n=12;t--}else 
if(n>12){n=1;t++}c()};if(e.attr("class").indexOf("left")!=-1){r("previous")}else{r("next")}})})
function updateValue(val, event) {document.getElementById("field17").value = val;event.preventDefault();}
</script>
</body></html><wehavechangedit>
"""
# Close the database connection, then write the page and close the file.
# NOTE(review): html_str is a plain triple-quoted string (not an f-string) and
# no .format()/.replace() is applied to it here, so the literal text
# "{inset}" is written to the file instead of the query results.
cnxn.close()
f.write(html_str)
f.close()
Can anyone point me in the direction of a better way to include the variables? Do I need to have the inset as an array for this model?
It's Py3.6, on Windows 10.
Have you tried to just save your html_str inside a template .html file, write your inset lines into a long string, then read your file into a string, do the replace, then re-write the file?
# Read the template as ONE string: readlines() returns a list of lines,
# which has no .replace() method.
with open('C:\\template.html') as file:
    wholefile = file.read()
use this to make a string of your results.
# Append one <div> line for the current row to the accumulated string.
inset += f'<div class="{row.isbusy!s}">{row.day!s}</div>\n'
and then do the replace, so you will have the complete file in a string, then write it back out.
# str.replace returns a new string; keep the result, otherwise the
# substitution is lost.
wholefile = wholefile.replace('{inset}', inset)
I was following the style guide for pandas and it worked pretty well.
How can I keep these styles using the to_html command through Outlook? The documentation seems a bit lacking for me.
(df.style
.format(percent)
.applymap(color_negative_red, subset=['col1', 'col2'])
.set_properties(**{'font-size': '9pt', 'font-family': 'Calibri'})
.bar(subset=['col4', 'col5'], color='lightblue'))
import win32com.client as win32
outlook = win32.Dispatch('outlook.application')
mail = outlook.CreateItem(0)
mail.Subject = subject_name
mail.HTMLbody = ('<html><body><p><body style="font-size:11pt;
font-family:Calibri">Hello,</p> + '<p>Title of Data</p>' + df.to_html(
index=False, classes=????????) '</body></html>')
mail.send
The to_html documentation shows that there is a classes command that I can put inside of the to_html method, but I can't figure it out. It also seems like my dataframe does not carry the style that I specified up top.
If I try:
df = (df.style
.format(percent)
.applymap(color_negative_red, subset=['col1', 'col2'])
.set_properties(**{'font-size': '9pt', 'font-family': 'Calibri'})
.bar(subset=['col4', 'col5'], color='lightblue'))
Then df is now a Style object and you can't use to_html.
Edit - this is what I am currently doing to modify my tables. This works, but I can't keep the cool features of the .style method that pandas offers.
email_paragraph = """
<body style= "font-size:11pt; font-family:Calibri; text-align:left; margin: 0px auto" >
"""
email_caption = """
<body style= "font-size:10pt; font-family:Century Gothic; text-align:center; margin: 0px auto" >
"""
email_style = '''<style type="text/css" media="screen" style="width:100%">
table, th, td {border: 0px solid black; background-color: #eee; padding: 10px;}
th {background-color: #C6E2FF; color:black; font-family: Tahoma;font-size : 13; text-align: center;}
td {background-color: #fff; padding: 10px; font-family: Calibri; font-size : 12; text-align: center;}
</style>'''
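Once you add style to your chained assignments you are operating on a Styler object. That object has a render method to get the HTML as a string (in pandas 1.3 and later the equivalent is Styler.to_html(); render was deprecated in 1.4 and removed in 2.0). So in your example, you could do something like this: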
# Chain the styling calls and turn the resulting Styler into an HTML string.
# Styler.to_html() replaces Styler.render(), which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
html = (
    df.style
    .format(percent)
    .applymap(color_negative_red, subset=['col1', 'col2'])
    .set_properties(**{'font-size': '9pt', 'font-family': 'Calibri'})
    .bar(subset=['col4', 'col5'], color='lightblue')
    .to_html()
)
Then include the html in your email instead of a df.to_html().
It's not an extravagant / pythonic solution. I inserted the link to a direct css file before the html code made by the to_html () method, then I saved the whole string as an html file. This worked well for me.
# Put a <link> to the external stylesheet on its own line in front of the
# table HTML produced by to_html(), then save the result as an HTML file.
dphtml = '\n'.join([
    r'<link rel="stylesheet" type="text/css" media="screen" href="css-table.css" />',
    dp.to_html(),
])
with open('datatable.html', 'w') as f:
    f.write(dphtml)
Selecting the table (the rendered, styled, dataframe widgets in jupyter) and copy-pasting to an email body worked for me (using outlook office).
No manual html extraction, saving, loading, or anything like that.
I have a directory with many HTML documents. Most of them contain the codeblock
.org-link {
/* org-link */
color: #b58900;
font-weight: bold;
text-decoration: underline;
}
inside the <style type="text/css"> tag. I'd like to write a script that removes the line text-decoration: underline; and changes the color to #2aa198 from this block in every file.
Is it possible to accomplish this with python?
You could use regular expressions to make the necessary replacements as follows:
import re
test = """
.org-link {
/* org-link */
color: #b58900;
font-weight: bold;
text-decoration: underline;
}
"""
def fix(org_link):
    """Rewrite one matched ``.org-link`` CSS block (``re.sub`` callback).

    *org_link* is a match object whose text starts at ``org-link``.  Only the
    part up to the first ``}`` -- the org-link rule itself -- is edited: its
    ``color`` value is replaced by ``#777`` and any
    ``text-decoration: underline;`` declaration is removed together with the
    whitespace in front of it.  Everything after that first ``}`` is returned
    unchanged, so a greedy caller pattern that runs on to the last ``}`` of
    the file can no longer recolour or strip later rules.
    """
    block, brace, rest = org_link.group(0).partition('}')
    # The lookbehind keeps properties such as background-color untouched.
    block = re.sub(r'(?<![\w-])(color\s*:)\s*[^;]*;', r'\1#777;', block)
    block = re.sub(r'\s+text-decoration: underline;', '', block)
    return block + brace + rest
# print() with a single argument behaves the same on Python 2 and 3.
# [^}]* stops the match at the closing brace of the org-link rule instead of
# running on to the last "}" in the text.
print(re.sub(r'(org-link\s+\{[^}]*\})', fix, test))
This would convert the text as follows:
.org-link {
/* org-link */
color:#777;
font-weight: bold;
}
It works by first identifying suitable org-link blocks and then first replacing the color and then removing any text-decoration entries.
The script could then be extended to carry this out on all of the HTML files in a given folder as follows:
import re
import glob
def fix(org_link):
    """Rewrite one matched ``.org-link`` CSS block (``re.sub`` callback).

    *org_link* is a match object whose text starts at ``org-link``.  Only the
    part up to the first ``}`` -- the org-link rule itself -- is edited: its
    ``color`` value is replaced by ``#777`` and any
    ``text-decoration: underline;`` declaration is removed together with the
    whitespace in front of it.  Everything after that first ``}`` is returned
    unchanged, so a greedy caller pattern that runs on to the last ``}`` of
    the file can no longer recolour or strip later rules.
    """
    block, brace, rest = org_link.group(0).partition('}')
    # The lookbehind keeps properties such as background-color untouched.
    block = re.sub(r'(?<![\w-])(color\s*:)\s*[^;]*;', r'\1#777;', block)
    block = re.sub(r'\s+text-decoration: underline;', '', block)
    return block + brace + rest
# Apply fix() to the org-link rule of every HTML file in the current
# directory, rewriting each file in place.
for html_file in glob.glob('*.html'):
    # print() with a single argument behaves the same on Python 2 and 3.
    print(html_file)
    with open(html_file) as f_input:
        # [^}]* stops at the closing brace of the org-link rule instead of
        # running on to the last "}" in the file.
        html = re.sub(r'(org-link\s+\{[^}]*\})', fix, f_input.read())
    with open(html_file, 'w') as f_output:
        f_output.write(html)
Tested using Python 2.7.9
What is an efficient way to generate PDF for data frames in Pandas?
First plot table with matplotlib then generate pdf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
# Demo data: 10 rows of random floats in three named columns.
df = pd.DataFrame(np.random.random((10, 3)), columns=("col 1", "col 2", "col 3"))

# Draw nothing but a table: hide the axes and let the table sit in the centre.
# https://stackoverflow.com/questions/32137396/how-do-i-plot-only-a-table-in-matplotlib
fig, ax = plt.subplots(figsize=(12, 4))
ax.axis('tight')
ax.axis('off')
the_table = ax.table(cellText=df.values, colLabels=df.columns, loc='center')

# bbox_inches='tight' trims the empty margins around the table.
# https://stackoverflow.com/questions/4042192/reduce-left-and-right-margins-in-matplotlib-plot
# The with-statement closes (and thereby finalizes) foo.pdf even when
# savefig raises, instead of relying on a manual pp.close().
with PdfPages("foo.pdf") as pp:
    pp.savefig(fig, bbox_inches='tight')
reference:
How do I plot only a table in Matplotlib?
Reduce left and right margins in matplotlib plot
Here is how I do it from sqlite database using sqlite3, pandas and pdfkit
import pandas as pd
import pdfkit as pdf
import sqlite3
# Read the whole "dobit" table into a DataFrame; the connection is only
# needed for the query, so close it even if the query fails.
con = sqlite3.connect("baza.db")
try:
    df = pd.read_sql_query("select * from dobit", con)
finally:
    con.close()

# Dump the frame to an intermediate HTML file, then let pdfkit convert that
# file to the final PDF.  The HTML path is named once so both steps agree.
html_path = '/home/linux/izvestaj.html'
df.to_html(html_path)
nazivFajla = '/home/linux/pdfPrintOut.pdf'  # output PDF file name
pdf.from_file(html_path, nazivFajla)
Well one way is to use markdown. You can use df.to_html(). This converts the dataframe into a html table. From there you can put the generated html into a markdown file (.md) (see http://daringfireball.net/projects/markdown/basics). From there, there are utilities to convert markdown into a pdf (https://www.npmjs.com/package/markdown-pdf).
One all-in-one tool for this method is to use Atom text editor (https://atom.io/). There you can use an extension, search "markdown to pdf", which will make the conversion for you.
Note: When using to_html() recently I had to remove extra '\n' characters for some reason. I chose to use Atom -> Find -> '\n' -> Replace "".
Overall this should do the trick!
With reference to these two examples that I found useful:
Apply CSS class to Pandas DataFrame using to_html
https://pbpython.com/pdf-reports.html
The simple CSS code saved in same folder as ipynb:
/* includes alternating gray and white with on-hover color */
.mystyle {
    font-size: 11pt;
    font-family: Arial;
    border-collapse: collapse;
    border: 1px solid silver;
}
/* Both selectors are scoped to .mystyle; a bare `th` would style every
   header cell in the document, not just those of this table. */
.mystyle td, .mystyle th {
    padding: 5px;
}
.mystyle tr:nth-child(even) {
    background: #E0E0E0;
}
.mystyle tr:hover {
    background: silver;
    cursor: pointer;
}
The python code:
# NOTE(review): `os`, `folder`, `file_pdf` and `HTML` are not defined in this
# snippet and must already be in scope (HTML is presumably weasyprint.HTML --
# confirm against the surrounding notebook).
pdf_filepath = os.path.join(folder, file_pdf)
demo_df = pd.DataFrame(np.random.random((10, 3)), columns=("col 1", "col 2", "col 3"))
# classes='mystyle' puts that CSS class on the generated <table> so the rules
# in df_style.css apply to it.
table = demo_df.to_html(classes='mystyle')
# The stylesheet <link> belongs inside <head>; placing it between </head> and
# <body> is invalid HTML.
html_string = f'''
<html>
<head><title>HTML Pandas Dataframe with CSS</title>
<link rel="stylesheet" type="text/css" href="df_style.css"/>
</head>
<body>
{table}
</body>
</html>
'''
# The stylesheet is also handed to write_pdf explicitly.
HTML(string=html_string).write_pdf(pdf_filepath, stylesheets=["df_style.css"])
This is a solution with an intermediate HTML file.
The table is pretty printed with some minimal css.
The pdf conversion is done with weasyprint. You need to pip install weasyprint.
# Create a pandas dataframe with demo data (the iris CSV is read from its URL):
import pandas as pd
demodata_csv = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
df = pd.read_csv(demodata_csv)
# Pretty print the dataframe as an html table to a file.
# NOTE(review): to_html_pretty is defined further down in this listing; its
# definition has to be executed before this call, otherwise the call raises
# NameError.
intermediate_html = '/tmp/intermediate.html'
to_html_pretty(df,intermediate_html,'Iris Data')
# if you do not want pretty printing, just use pandas:
# df.to_html(intermediate_html)
# Convert the html file to a pdf file using weasyprint
import weasyprint
out_pdf= '/tmp/demo.pdf'
weasyprint.HTML(intermediate_html).write_pdf(out_pdf)
# This is the table pretty printer used above:
def to_html_pretty(df, filename='/tmp/out.html', title=''):
    """Write an entire dataframe to an HTML file with nice formatting.

    The page is HTML_TEMPLATE1, an optional ``<h2>`` heading built from
    *title* (omitted when *title* is the empty string), the table produced by
    ``df.to_html(classes='wide', escape=False)``, and HTML_TEMPLATE2.
    Because ``escape=False`` is used, cell contents are written as raw HTML.

    Thanks to @stackoverflowuser2010 for the pretty printer, see
    https://stackoverflow.com/a/47723330/362951
    """
    heading = '' if title == '' else '<h2> %s </h2>\n' % title
    page_body = heading + df.to_html(classes='wide', escape=False)
    with open(filename, 'w') as out_file:
        out_file.write(HTML_TEMPLATE1 + page_body + HTML_TEMPLATE2)
# Opening part of the page that to_html_pretty writes: the <head> with inline
# CSS (centered heading and table, bordered cells, row hover highlight, and
# the `.wide` class applied to the table) up to and including the opening
# <body> tag.
HTML_TEMPLATE1 = '''
<html>
<head>
<style>
h2 {
text-align: center;
font-family: Helvetica, Arial, sans-serif;
}
table {
margin-left: auto;
margin-right: auto;
}
table, th, td {
border: 1px solid black;
border-collapse: collapse;
}
th, td {
padding: 5px;
text-align: center;
font-family: Helvetica, Arial, sans-serif;
font-size: 90%;
}
table tbody tr:hover {
background-color: #dddddd;
}
.wide {
width: 90%;
}
</style>
</head>
<body>
'''
# Closing part of the page: closes the <body> and <html> elements opened by
# HTML_TEMPLATE1.
HTML_TEMPLATE2 = '''
</body>
</html>
'''
Thanks to @stackoverflowuser2010 for the pretty printer, see stackoverflowuser2010's answer https://stackoverflow.com/a/47723330/362951
I did not use pdfkit, because I had some problems with it on a headless machine. But weasyprint is great.
When using Matplotlib, here's how to get a prettier table with alternating colors for the rows, etc., as well as how to optionally paginate the PDF:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
def _draw_as_table(df, pagesize):
    """Render *df* as a matplotlib table on a blank figure and return the figure.

    Row-label and column-label cells are light blue; body rows alternate
    between white and light gray, starting with white.  *pagesize* is passed
    to ``plt.subplots`` as ``figsize``.
    """
    n_rows, n_cols = len(df), len(df.columns)
    stripes = ('white', 'lightgray')
    # One colour list per body row, cycling through the two stripe colours.
    body_colours = [[stripes[r % 2]] * n_cols for r in range(n_rows)]
    header_colour = 'lightblue'

    fig, ax = plt.subplots(figsize=pagesize)
    for mode in ('tight', 'off'):
        ax.axis(mode)
    ax.table(
        cellText=df.values,
        rowLabels=df.index,
        colLabels=df.columns,
        rowColours=[header_colour] * n_rows,
        colColours=[header_colour] * n_cols,
        cellColours=body_colours,
        loc='center',
    )
    return fig
def dataframe_to_pdf(df, filename, numpages=(1, 1), pagesize=(11, 8.5)):
    """Write *df* to a PDF file as one table per page.

    Parameters
    ----------
    df : the DataFrame to write.
    filename : path of the PDF to create.
    numpages : ``(nh, nv)`` -- the rows are split into ``nh`` consecutive
        chunks and the columns into ``nv`` consecutive chunks; every
        non-empty row-chunk/column-chunk combination becomes one page.
    pagesize : ``(width, height)`` handed to ``_draw_as_table`` (which uses
        it as the figure size).

    Raises
    ------
    ZeroDivisionError
        If ``nh`` or ``nv`` is 0.
    """
    with PdfPages(filename) as pdf:
        nh, nv = numpages
        # Ceiling division: with floor division the trailing rows/columns
        # were silently dropped whenever the size was not an exact multiple
        # of the page count (e.g. 10 rows on 3 pages lost the last row).
        rows_per_page = -(-len(df) // nh)
        cols_per_page = -(-len(df.columns) // nv)
        for i in range(nh):
            for j in range(nv):
                # iloc slices clamp at the frame's bounds, so no min() needed.
                page = df.iloc[i * rows_per_page:(i + 1) * rows_per_page,
                               j * cols_per_page:(j + 1) * cols_per_page]
                if page.empty:
                    # More pages requested than there is data for.
                    continue
                fig = _draw_as_table(page, pagesize)
                if nh > 1 or nv > 1:
                    # Add a part/page number at bottom-center of the page.
                    # fig.text takes figure fractions, so a fixed vertical
                    # offset must be divided by the height (pagesize[1]).
                    fig.text(0.5, 0.5 / pagesize[1],
                             "Part-{}x{}: Page-{}".format(i + 1, j + 1, i * nv + j + 1),
                             ha='center', fontsize=8)
                pdf.savefig(fig, bbox_inches='tight')
                # Close exactly the figure just saved so figures do not pile up.
                plt.close(fig)
Use it as follows:
dataframe_to_pdf(df, 'test_1.pdf')
dataframe_to_pdf(df, 'test_6.pdf', numpages=(3, 2))
Explanation of the code is here:
https://levelup.gitconnected.com/how-to-write-a-pandas-dataframe-as-a-pdf-5cdf7d525488