I'm completely new to coding (it's just for fun and hopefully to save some time at work) and I've been trying to get my first lines of code working.
Specifically, I want my code to open a certain Excel workbook, find certain worksheets which are actually chartsheets (each one with only one chart in it) and print them as pdf/jpeg files in a specific folder. I went for the ExportAsFixedFormat, but I encountered the following error.
AttributeError: 'Chartsheet' object has no attribute 'ExportAsFixedFormat'
Could you please help me? Is there any way to print/save a Chartsheet?
I went through the Chartsheet Object's methods, but I couldn't find anything helpful. I'm sure I'm missing something.
Some info about my configuration:
Windows 10 Home x64
Excel for Microsoft 365 MSO (16.0.13628.20318) 64 bit
Python 3.8 32 bit
Pywin32 version 227
Below the chunk of code that I'm having problems with.
[Edit]: below the whole code I wrote, maybe the error is not where I think it is.
Thank you in advance and sorry for my broken English.
First of all, I've imported a ton of things, I'm aware I most probably need just half of them.
import plotly
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.cm as cm
import matplotlib.mlab as mlab
import win32com.client as win32
import openpyxl
import os, sys
import math
import openpyxl
from openpyxl import Workbook, load_workbook
from openpyxl import chart
from openpyxl import chartsheet
from openpyxl.chartsheet.publish import WebPublishItem, WebPublishItems
from openpyxl.drawing.spreadsheet_drawing import SpreadsheetDrawing
#from .drawings import find_images
from openpyxl.chartsheet import Chartsheet
import openpyxl.chart
import win32com.client
from pdf2image import convert_from_path
from pathlib import Path
import xlsxwriter
And here is the code I wrote:
# Everything is resolved relative to the folder that contains this script.
script_path = Path(__file__).resolve()
current_folder = script_path.parent

image_folder_name = "Immages"
image_folder_path = os.path.join(current_folder, image_folder_name)

# Create the output folder; when that fails (e.g. it already exists),
# delete the files found in it instead.
try:
    os.mkdir(image_folder_path)
except OSError:
    for leftover in os.listdir(image_folder_path):
        os.remove(image_folder_path + '\\' + leftover)

# Keep only the .xlsx files that sit next to the script.
excel_list = [entry for entry in os.listdir(current_folder) if entry.endswith('.xlsx')]

chartsheets_names = ['Chartsheet1', 'Chartsheet2', 'Chartsheet3', 'Chartsheet4']
excel = win32.gencache.EnsureDispatch('Excel.Application')

for excelfile in excel_list:
    # The workbook is loaded with openpyxl here, not through the Excel COM object.
    wb = load_workbook(os.path.join(current_folder, excelfile))
    for sheet in chartsheets_names:
        ws = wb[sheet]
        image_file_name = excelfile[:-5] + '_' + sheet + '.pdf'
        image_file_path = os.path.join(image_folder_path, image_file_name)
        # NOTE: this is the call behind the reported AttributeError
        # ('Chartsheet' object has no attribute 'ExportAsFixedFormat').
        ws.ExportAsFixedFormat(0, image_file_path)
        convert_from_path(image_file_path, dpi=300, output_folder=image_folder_path, fmt='jpeg')
    wb.Close()
I managed to get what I wanted in the end. Below is the code I'm using now, maybe it could be helpful to someone else too.
I think I was mixing up code meant for win32com with code meant for openpyxl.
Now I would like my Chartsheets to stretch all over the printing area prior to printing (I tried to set margins to zero, it does not work). I think I should use wb_sheet.PageSetup.ChartSize with the value FullPage, but I do not get how to assign it.
import os
import sys
from pathlib import Path
import win32com.client as w3c
from pdf2image import convert_from_path
# find the parent folder of the .py file
path_filePy = Path(__file__).resolve()
current_folder = path_filePy.parent
print(current_folder)

# create the destination folder or empty it if existing
image_folder_name = "Immages"
image_folder_path = os.path.join(current_folder, image_folder_name)
try:
    os.mkdir(image_folder_path)
except FileExistsError:
    # The folder is already there: remove whatever a previous run left behind.
    for f in os.listdir(image_folder_path):
        os.remove(os.path.join(image_folder_path, f))

# list of only *.xlsx files in the script's folder
excel_list = [name for name in os.listdir(current_folder) if name.endswith('.xlsx')]

# list of sheets' names I want to print
chartsheets_names = ['Sheet1', 'Sheet2', 'Sheet3', 'Sheet4']

o = w3c.Dispatch("Excel.Application")
try:
    o.Visible = False
    # for each sheet named in my list, in each xlsx file, print to both pdf and jpeg
    for excel_file in excel_list:
        wb_path = os.path.join(os.path.abspath(current_folder), excel_file)
        try:
            wb = o.Workbooks.Open(wb_path)
            try:
                for wb_sheet in wb.Sheets:
                    if wb_sheet.Name not in chartsheets_names:
                        continue
                    # Shared stem for the PDF and the JPEG produced from it.
                    output_name = excel_file[:-5] + ' - ' + str(wb_sheet.Name)
                    path_to_pdf = os.path.join(os.path.abspath(image_folder_path), output_name + '.pdf')
                    # FileFormat=57 is the value the original script used to get a PDF out of SaveAs.
                    wb_sheet.SaveAs(path_to_pdf, FileFormat=57)
                    convert_from_path(
                        path_to_pdf,  # the input pdf file
                        dpi=300,
                        output_folder=image_folder_path,
                        fmt='jpeg',
                        output_file=output_name,
                        poppler_path=r"C:\where\your\poppler\bin folder is",
                        use_pdftocairo=False)
            finally:
                # Close without saving, even when the export or conversion failed.
                wb.Close(False)
        except OSError as err:
            # Keep going with the next workbook, but say which one was skipped and why.
            print(f"Skipping {excel_file}: {err}", file=sys.stderr)
finally:
    # Quit must be *called*; a bare `o.Quit` only looks the method up and leaves Excel running.
    o.Quit()
`
See updated answer further down.
See further update below.
After pip install comtypes this works for me:
import os
import comtypes.client
SOURCE_DIR = r'C:\Users\xyz\SO-samples'  # adjust to your needs
TARGET_DIR = r'C:\Users\xyz\SO-samples'  # adjust to your needs

infile = os.path.join(os.path.abspath(SOURCE_DIR), 'an-excel-file.xlsx')
outfile = os.path.join(os.path.abspath(TARGET_DIR), 'an-excel-file.pdf')

app = comtypes.client.CreateObject('Excel.Application')
try:
    app.Visible = False
    doc = app.Workbooks.Open(infile)
    try:
        # Positional arguments: Type, Filename, Quality, IncludeDocProperties.
        # Type 0 is xlTypePDF in Excel's object model.
        doc.ExportAsFixedFormat(0, outfile, 1, 0)
    finally:
        doc.Close()
finally:
    # Always shut the Excel instance down, even when opening or exporting failed,
    # so no hidden Excel process is left behind.
    app.Quit()
Updated answer - selectable sheets:
import os
import win32com.client
SOURCE_DIR = r'C:\Users\xyz\SO-samples'  # adjust
TARGET_DIR = r'C:\Users\xyz\SO-samples'  # adjust
wb_path = os.path.join(os.path.abspath(SOURCE_DIR), 'xyzzy.xlsx')

# Excel has to be started before the workbook can be opened through it.
o = win32com.client.Dispatch("Excel.Application")
o.Visible = False
wb = o.Workbooks.Open(wb_path)
try:
    # `Worksheets` is spelled as in Excel's object model so the call also works
    # with early-bound (makepy-generated) wrappers, which are case-sensitive.

    # print 1 sheet to 1 file
    path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy2.pdf')
    ws_index_list = [2]  # say you want to print this sheet
    wb.Worksheets(ws_index_list).Select()
    wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

    # print 2 sheets to 1 file
    path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-1-3.pdf')
    ws_index_list = [1, 3]  # say you want to print these sheets
    wb.Worksheets(ws_index_list).Select()
    wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

    # print 3 sheets to 1 file each
    ws_index_list = [1, 2, 3]  # say you want to print these sheets
    for ws_index in ws_index_list:
        path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-' + str(ws_index) + '.pdf')
        wb.Worksheets([ws_index]).Select()
        wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

    # select sheet by name, print 1 sheet to 1 file
    ws_sheet_name = 'named_sheet'
    path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-' + ws_sheet_name + '.pdf')
    wb.Worksheets(ws_sheet_name).Select()
    wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)
finally:
    # Nothing was edited, so close without saving and without a prompt.
    wb.Close(False)
Further update - printing sheet names, select sheet by name:
import win32com.client as w3c
import os, sys
SOURCE_DIR = r'C:\Users\xyz\SO-samples'
TARGET_DIR = r'C:\Users\xyz\SO-samples'
wb_path = os.path.join(os.path.abspath(SOURCE_DIR), 'xyzzy.xlsx')

o = w3c.Dispatch("Excel.Application")
o.Visible = False
wb = o.Workbooks.Open(wb_path)
try:
    # List every sheet name so the right one can be picked below.
    for wb_sheet in wb.Sheets:
        print(wb_sheet.Name)

    ### this works: select by a list of 1-based sheet indexes
    ws_index_list = [1, 3]
    path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy' + '.pdf')
    wb.Worksheets(ws_index_list).Select()
    wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)

    ### this works: select by sheet name
    ws_sheet_name = 'xyzzy'
    path_to_pdf = os.path.join(os.path.abspath(TARGET_DIR), 'xyzzy-xyzzy' + '.pdf')
    wb.Worksheets(ws_sheet_name).Select()
    wb.ActiveSheet.ExportAsFixedFormat(0, path_to_pdf)
finally:
    # Close the workbook even when one of the exports above raised.
    wb.Close()
Related
I currently have 3 Excel files in my working directory. All 3 file names end with "_Updated.xlsx". I want to transform the files so that the empty rows in each of them are deleted. I have created a function for that; the only issue is that I cannot save all the transformed files using the code below, and I am not sure what is wrong. The reason for creating new files is that I would like to keep my raw files.
Python code
import openpyxl
import os
from openpyxl import load_workbook,Workbook
import glob
from pathlib import Path
Excel_file_path = "/Excel"

for file in Path(Excel_file_path).glob('*_Updated.xlsx'):
    wb = load_workbook(file)
    wb_modified = False

    # Pass 1: strip the leading rows of every sheet that has more than one row.
    for sheet in wb.worksheets:
        max_row_in_sheet = sheet.max_row
        max_col_in_sheet = sheet.max_column
        sheet_modified = False
        if max_row_in_sheet > 1:
            first_nonempty_row = nonempty_row()  # Function to find nonempty row
            sheet_modified = del_rows_before(first_nonempty_row)  # Function to delete nonempty row
        wb_modified = wb_modified or sheet_modified

    # Pass 2: copy each sheet, cell by cell, into a new workbook of its own.
    # NOTE: `workbooks` is not defined anywhere in this snippet; kept as posted.
    if wb_modified:
        for workbook in workbooks:
            for sheet in wb.worksheets:
                new_wb = Workbook()
                ws = new_wb.active
                for row_data in sheet.iter_rows():
                    for row_cell in row_data:
                        ws[row_cell.coordinate].value = row_cell.value
                new_wb.save("/Excel/" + sheet.title + "_Transformed.xlsx")
In case, if any one is still looking for answer to my above question. Below is the code that worked for me.
import openpyxl
import os
from openpyxl import load_workbook
import glob
from pathlib import Path
Excel_file_path = "/Excel"

for file in Path(Excel_file_path).glob('*_Updated.xlsx'):
    wb = load_workbook(file)
    try:
        for sheet in wb.worksheets:
            # Sheets with at most one row are left untouched.
            if sheet.max_row > 1:
                first_nonempty_row = get_first_nonempty_row()  # Function to find nonempty row
                del_rows_before(first_nonempty_row)  # Function to delete the rows before it
        # Save next to the source file. The original wrote to the relative path
        # "Excel/", which only matches "/Excel" when the script is run from "/".
        wb.save(str(file.with_name(file.stem + "_Transformed.xlsx")))
    finally:
        wb.close()
I get this error
TypeError: 'Workbook' object is not subscriptable
when i run this code
import xlsxwriter
from openpyxl import load_workbook
in_folder = r'xxx'  # Input folder
out_folder = r'xxx'  # Output folder

# Make sure the output folder exists.
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

file_exist = False
dir_list = os.listdir(in_folder)
for xlfile in dir_list:
    # Only Excel files are of interest.
    if not xlfile.endswith(('.xlsx', '.xls')):
        continue
    file_exist = True
    str_file = os.path.join(in_folder, xlfile)
    work_book = xlsxwriter.Workbook(filename=str_file)
    work_sheet = work_book['test1']  # error above is thrown here
    work_sheet.write_formula('C2', '=A2+B2')  # Add formular but not sure of how to apply it to the entire column.
    out_Path = os.path.join(out_folder, work_book)
Edit:
I managed to figure out the above and using this code:-
work_book = openpyxl.load_workbook(os.path.join(in_folder,xlfile))
work_sheet = work_book['test1']
However, the issue formulas still exists in the new code below:-
from openpyxl import load_workbook
in_folder = r'xxx'  # Input folder
out_folder = r'xxx'  # Output folder

# Make sure the output folder exists.
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

file_exist = False
dir_list = os.listdir(in_folder)
for xlfile in dir_list:
    # Only Excel files are of interest.
    if not xlfile.endswith(('.xlsx', '.xls')):
        continue
    str_file = xlfile
    work_book = openpyxl.load_workbook(os.path.join(in_folder, str_file))
    work_sheet = work_book['Sheet1']
    row_count = work_sheet.max_row
    # Prints the total row count once per row of the sheet.
    for row in work_sheet.iter_rows(min_row=1, min_col=1, max_row=work_sheet.max_row):
        print(row_count)
    # NOTE: the formula interpolates `row_count`, not the running index `i`,
    # so every cell of column U receives the same row reference.
    for i, cellObj in enumerate(work_sheet['U'], 2):
        cellObj.value = f'=Q{row_count}-T{row_count}'
    work_book.save(os.path.join(out_folder, xlfile))
Ideally, I would like to loop through a folder of .xlsx files, add a formula and apply it to the entire column (U). I would then like to save the files (with the formula applied) in another folder (out_folder).
Documentation for xlsxwriter.Workbook shows
work_book.get_worksheet_by_name('test1')
Maybe openpyxl or other module could use ['test1']
I would like to only include certain sheets in an excel using the Filter() function.
This is my code so far:
import win32com.client as win32
from pathlib import Path
win32c = win32.constants
excel = win32.gencache.EnsureDispatch('Excel.Application')
wb = excel.Workbooks.Open("C:/Prueba/GOOG.xlsm")
def included(sheet_name):
    """Return True if *sheet_name* is one of the report sheets to keep, else False."""
    wanted = ('Report_Data', 'Report_Main')
    # Return an explicit bool instead of True / implicit None.
    return sheet_name in wanted
wb.__Sheets__ = filter(included, [sheet.Name for sheet in wb.Sheets]) # wb.__Sheets__ doesn't work of course...
My guess is that I need to properly access the Sheets attribute from workbook object and then the filter setup should do it. I tried "Sheets" for instance, but doesn't seem to work (also does not throw an error...).
Any ideas?
For the filter process you're using, you don't need to open the workbook. You can load the file using openpyxl and get the sheet names.
Try this code:
from pathlib import Path
import openpyxl
wb = openpyxl.load_workbook('C:/Prueba/GOOG.xlsm')
print("All Sheets:", wb.sheetnames)
def included(sheet_name):
    """Return True if *sheet_name* is one of the report sheets to keep, else False."""
    wanted = ('Report_Data', 'Report_Main')
    # Return an explicit bool instead of True / implicit None.
    return sheet_name in wanted
wb.__Sheets__ = filter(included, wb.sheetnames) # wb.__Sheets__ doesn't work of course..
print(list(wb.__Sheets__))
If you prefer to stay with Win32 and have Excel actually open, you can use this code:
from win32com.client import Dispatch
import win32com
import win32com.client as win32
excel = win32com.client.dynamic.Dispatch('Excel.Application')
excel.Visible = True
wb = excel.Workbooks.Open("C:/Prueba/GOOG.xlsm")
print("All Sheets:",[wb.Sheets(i+1).Name for i in range(wb.Sheets.Count)])
def included(sheet_name):
    """Return True if *sheet_name* is one of the report sheets to keep, else False."""
    wanted = ('Report_Data', 'Report_Main')
    # Return an explicit bool instead of True / implicit None.
    return sheet_name in wanted
ShtList = filter(included, [wb.Sheets(i+1).Name for i in range(wb.Sheets.Count)])
print(list(ShtList))
excel.Quit()
Here is the complete code to delete the extra sheets and save the workbook as a new file.
from win32com.client import Dispatch
import win32com
import win32com.client as win32
from shutil import copyfile
excel = win32com.client.dynamic.Dispatch('Excel.Application')
excel.Visible = True
filename = "C:/Prueba/GOOG.xlsm"
filenamenew = "C:/Prueba/GOOG.New.xlsm"
copyfile(filename, filenamenew)
wb = excel.Workbooks.Open(filenamenew)
print("All Sheets:",[wb.Sheets(i+1).Name for i in range(wb.Sheets.Count)])
def remove(sheet_name):
    """Return True if *sheet_name* is NOT one of the report sheets to keep, i.e. it should be deleted."""
    keep = ('Report_Data', 'Report_Main')
    # Return an explicit bool instead of True / implicit None.
    return sheet_name not in keep
# Names of every sheet that `remove` flags for deletion.
ShtList = [name for name in (wb.Sheets(i + 1).Name for i in range(wb.Sheets.Count)) if remove(name)]
print("DelLst:",ShtList)
excel.DisplayAlerts = False  # suppress the confirmation prompt Excel shows on delete
try:
    for s in ShtList:
        print("del", s)
        # The names were collected from wb.Sheets, which also contains chart
        # sheets; wb.Worksheets would not find those, so delete via wb.Sheets.
        wb.Sheets(s).Delete()
    wb.Save()
finally:
    # Restore the prompt setting and shut Excel down even when a delete failed.
    excel.DisplayAlerts = True
    excel.Quit()
I have earlier come up with this question in here:
pypdf2-merging-pdf-pages-issue
Where I have now come a long way and can now create my PDF files from an Excel document via Pandas into PyPDF2.
As well as where I now have the number of pages that must be per. PDF.
However, my problem now is that my merged PDF files are now blank.
If I do a debug, then I can see that in my second loop, which contains the variable "paths" the right paths to my physical PDF files.
But that when they then come in through:
with path.open('rb') as pdf:
pdf_writer.append(pdf)
Then suddenly an extra "\" appears in the paths, so that a path that was named C:\users\... is suddenly shown as C:\\users\\...
Do not know if this is what prevents the files from being opened and read correctly, and then merged into one PDF file.
Hope some can guide me as python for me is self taught.
Or in some other way can explain to me why I get created some merged PDF files that are suddenly blank on 3 pages.
My code is:
import datetime #Handle date
import pandas as pd #Handle data from Excel Sheet (Data analysis)
import PyPDF2 as pdf2 #Handle PDF read and merging
from pathlib import Path #Handle path
#Skip ERROR-message: Xref table not zero-indexed. ID numbers for objects will be corrected.
#import sys
#if not sys.warnoptions:
# import warnings
# warnings.simplefilter("ignore")
PDF_PATH = Path('C:/Users/TH/PDF/')
EXCEL_FILENAME = 'Resources/liste.xlsx'
def main():
    """Merge the three exercise PDFs named on each row of the Excel sheet into one PDF per row."""
    current_date = datetime.date.today()  # The date now
    next_week = current_date.isocalendar()[1] + 1  # isocalendar(): 0=Year, 1=week
    resources = pd.read_excel(EXCEL_FILENAME, sheet_name='Ark1')
    for row in resources.itertuples():
        year = row.Aargang
        exercise_names = (row.Oevelse1, row.Oevelse2, row.Oevelse3)
        paths = [(PDF_PATH / exercise).with_suffix('.pdf') for exercise in exercise_names]
        pdf_writer = pdf2.PdfFileMerger()
        for path in paths:
            # Each source PDF is handed to the merger as an open file object,
            # which is closed again as soon as this `with` block exits.
            # NOTE(review): presumably related to the blank pages described in the question - confirm.
            with path.open('rb') as pdf:
                pdf_writer.append(pdf)
        merged_name = f'Uge {next_week} - {year} Merged_doc.pdf'
        with open(merged_name, 'wb') as output:
            pdf_writer.write(output)
if __name__ == '__main__':
main()
#anon01 Thx
And Thx/credit to Sirius3.
The problem was in how PyPDF2 has to be used, together with some bugs in it.
After editing the code as shown below, it works.
import datetime #Handle date
import pandas as pd #Handle data from Excel Sheet (Data analysis)
from PyPDF2 import PdfFileMerger #Handle PDF read and merging
from pathlib import Path #Handle path
#Skip ERROR-message: Xref table not zero-indexed. ID numbers for objects will be corrected.
#import sys
#if not sys.warnoptions:
# import warnings
# warnings.simplefilter("ignore")
PDF_PATH = Path('C:/Users/TH/PDF')
EXCEL_FILENAME = 'Resources/liste.xlsx'
def main():
    """Merge the three exercise PDFs named on each row of the Excel sheet.

    Reads sheet 'Ark1' of EXCEL_FILENAME and, for every row, writes one merged
    PDF named after next week's ISO week number and the row's year group.
    """
    # Take the ISO week of the date one week from today. Adding 1 to the
    # current week number would give a non-existent week 53/54 at year end.
    next_week = (datetime.date.today() + datetime.timedelta(weeks=1)).isocalendar()[1]
    resources = pd.read_excel(EXCEL_FILENAME, sheet_name='Ark1')
    for row in resources.itertuples():
        year = row.Aargang
        paths = [
            (PDF_PATH / row.Oevelse1).with_suffix('.pdf'),
            (PDF_PATH / row.Oevelse2).with_suffix('.pdf'),
            (PDF_PATH / row.Oevelse3).with_suffix('.pdf'),
        ]
        pdf_merger = PdfFileMerger()
        try:
            for path in paths:
                # Pass the path as a string so the merger opens the file itself.
                pdf_merger.append(str(path))
            with open(f'Uge {next_week} - {year} Merged_doc.pdf', 'wb') as output:
                pdf_merger.write(output)
        finally:
            # Release the merger's file handles even when a source PDF is missing.
            pdf_merger.close()
if __name__ == '__main__':
main()
I can open a password-protected Excel file with this:
import sys
import win32com.client
xlApp = win32com.client.Dispatch("Excel.Application")
# print() as a function: the Python 2 print statement is a syntax error in Python 3.
print("Excel library version:", xlApp.Version)
# Expects the workbook path and its password as the first two command-line arguments.
filename, password = sys.argv[1:3]
xlwb = xlApp.Workbooks.Open(filename, Password=password)
# xlwb = xlApp.Workbooks.Open(filename)
xlws = xlwb.Sheets(1)  # counts from 1, not from 0
print(xlws.Name)
print(xlws.Cells(1, 1))  # that's A1
I'm not sure though how to transfer the information to a pandas dataframe. Do I need to read cells one by one and all, or is there a convenient method for this to happen?
Simple solution
import io
import pandas as pd
import msoffcrypto
passwd = 'xyz'
# The original opened the undefined name `i`; name the input file explicitly.
encrypted_path = 'encrypted.xlsx'  # adjust: path of the password-protected workbook

# Decrypt into an in-memory buffer so no unprotected copy is written to disk.
decrypted_workbook = io.BytesIO()
with open(encrypted_path, 'rb') as file:
    office_file = msoffcrypto.OfficeFile(file)
    office_file.load_key(password=passwd)
    # decrypt() has to run while the source file is still open.
    office_file.decrypt(decrypted_workbook)

df = pd.read_excel(decrypted_workbook, sheet_name='abc')
pip install --user msoffcrypto-tool
Exporting all sheets of each excel from directories and sub-directories to seperate csv files
from glob import glob
PATH = "Active Cons data"

# Scan all the Excel files in the directory and its sub-directories
excel_files = [y for x in os.walk(PATH) for y in glob(os.path.join(x[0], '*.xlsx'))]

for excel_path in excel_files:
    print(str(excel_path))
    # Decrypt each workbook into an in-memory buffer.
    decrypted_workbook = io.BytesIO()
    with open(excel_path, 'rb') as file:
        office_file = msoffcrypto.OfficeFile(file)
        office_file.load_key(password=passwd)
        office_file.decrypt(decrypted_workbook)

    # sheet_name=None loads every sheet at once as {sheet name: DataFrame},
    # so the sheets do not have to be read again one by one.
    sheets = pd.read_excel(decrypted_workbook, sheet_name=None)
    print(list(sheets))  # list of sheet names

    # Distinct loop names: the original reused `i` for both the file path and
    # the sheet index.
    for sheet, sheet_df in sheets.items():
        # NOTE: the file name depends on the sheet name only, so equally named
        # sheets from different workbooks overwrite each other (unchanged).
        new_file = f"D:\\all_csv\\{sheet}.csv"
        sheet_df.to_csv(new_file, index=False)
Assuming the starting cell is given as (StartRow, StartCol) and the ending cell is given as (EndRow, EndCol), I found the following worked for me:
# Get the content in the rectangular selection region
# content is a tuple of tuples
content = xlws.Range(xlws.Cells(StartRow, StartCol), xlws.Cells(EndRow, EndCol)).Value
# Transfer content to pandas dataframe
dataframe = pandas.DataFrame(list(content))
Note: Excel Cell B5 is given as row 5, col 2 in win32com. Also, we need list(...) to convert from tuple of tuples to list of tuples, since there is no pandas.DataFrame constructor for a tuple of tuples.
from David Hamann's site (all credits go to him)
https://davidhamann.de/2018/02/21/read-password-protected-excel-files-into-pandas-dataframe/
Use xlwings, opening the file will first launch the Excel application so you can enter the password.
import pandas as pd
import xlwings as xw
PATH = '/Users/me/Desktop/xlwings_sample.xlsx'
wb = xw.Book(PATH)
sheet = wb.sheets['sample']
df = sheet['A1:C4'].options(pd.DataFrame, index=False, header=True).value
df
Assuming that you can save the encrypted file back to disk using the win32com API (which I realize might defeat the purpose) you could then immediately call the top-level pandas function read_excel. You'll need to install some combination of xlrd (for Excel 2003), xlwt (also for 2003), and openpyxl (for Excel 2007) first though. Here is the documentation for reading in Excel files. Currently pandas does not provide support for using the win32com API to read Excel files. You're welcome to open up a GitHub issue if you'd like.
Based on the suggestion provided by #ikeoddy, this should put the pieces together:
How to open a password protected excel file using python?
# Import modules
import pandas as pd
import win32com.client
import os
import getpass
# Name file variables
file_path = r'your_file_path'
file_name = r'your_file_name.extension'
full_name = os.path.join(file_path, file_name)
# print(full_name)
Getting command-line password input in Python
# You are prompted to provide the password to open the file
xl_app = win32com.client.Dispatch('Excel.Application')
pwd = getpass.getpass('Enter file password: ')
Workbooks.Open Method (Excel)
# Positional arguments of Workbooks.Open: FileName, UpdateLinks, ReadOnly, Format, Password.
# The workbook is opened read-only, without updating links, using the typed-in password.
xl_wb = xl_app.Workbooks.Open(full_name, False, True, None, pwd)
xl_app.Visible = False
xl_sh = xl_wb.Worksheets('your_sheet_name')

# Get last_row: walk down column 1 until the first cell whose Value is None.
row_num = 0
cell_val = ''
while cell_val is not None:
    row_num += 1
    cell_val = xl_sh.Cells(row_num, 1).Value
last_row = row_num - 1

# Get last_column: walk along row 1 until the first cell whose Value is None.
col_num = 0
cell_val = ''
while cell_val is not None:
    col_num += 1
    cell_val = xl_sh.Cells(1, col_num).Value
last_col = col_num - 1
ikeoddy's answer:
content = xl_sh.Range(xl_sh.Cells(1, 1), xl_sh.Cells(last_row, last_col)).Value
# list(content)
df = pd.DataFrame(list(content[1:]), columns=content[0])
df.head()
python win32 COM closing excel workbook
xl_wb.Close(False)
Adding to #Maurice answer to get all the cells in the sheet without having to specify the range
wb = xw.Book(PATH, password='somestring')
sheet = wb.sheets[0] #get first sheet
#sheet.used_range.address returns string of used range
df = sheet[sheet.used_range.address].options(pd.DataFrame, index=False, header=True).value