pandas: Writing to an existing excel file (xlsx) using to_excel - python

I have a simple use case for df.to_excel() that I'm struggling with. I want to write to a specific worksheet tab (let's call it "Data") of an existing XLSX workbook, which could be referenced by formulas and pivots on other tabs.
I've tried to modify ExcelWriter in two ways but both produce errors from openpyxl.
Read an existing sheet using get_sheet_by_name (This errors: "NotImplementedError: use 'iter_rows()' instead".)
Create a new sheet using create_sheet. (This errors:"ReadOnlyWorkbookException: Cannot create new sheet in a read-only workbook")
# Question snippet: subclass pandas' ExcelWriter so that df.to_excel() writes
# into a workbook that has already been loaded with openpyxl.
# NOTE(review): DataFrame and ExcelWriter are not imported anywhere in this
# snippet; they are presumably imported from pandas elsewhere.
df=DataFrame()
from openpyxl.reader.excel import load_workbook
# NOTE(review): per the answer that follows, use_iterators=True loads the
# workbook read-only, which is why create_sheet() fails further down.
book = load_workbook('my_excel_file.xlsx', use_iterators=True) # Assume my_excel_file.xlsx contains a sheet called 'Data'
class temp_excel_writer(ExcelWriter): # I need this to inherit the other methods of ExcelWriter in io/parsers.py
    # Replaces the parent constructor entirely: the parent __init__ is never
    # called, every attribute is set by hand from the pre-loaded workbook.
    def __init__(self, path, book):
        self.book=book
        test_sheet=self.book.create_sheet(title='Test') # This errors: ReadOnlyWorkbookException
        self.use_xlsx = True
        self.sheet_names=self.book.get_sheet_names()
        self.actual_sheets=self.book.worksheets
        # Map every sheet name to a (worksheet, 1) tuple.
        # presumably the 1 is a row counter used by ExcelWriter - TODO confirm
        self.sheets={}
        for i,j in enumerate(self.sheet_names):
            self.sheets[j] = (self.actual_sheets[i],1)
        self.cur_sheet = None
        # NOTE(review): `save` is not defined anywhere in this snippet; the
        # constructor argument is called `path`.
        self.path = save
my_temp_writer=temp_excel_writer('my_excel_file.xlsx', book)
df.to_excel(my_temp_writer, sheet_name='Data')
Any thoughts? Am I missing something obvious? I'm still on pandas 0.7.2.

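When you load your workbook with use_iterators=True, openpyxl calls _set_optimized_read() on the Workbook object, which causes the workbook to be loaded read-only. That is why create_sheet() raises ReadOnlyWorkbookException.
Loading the workbook with use_iterators=False avoids this, as the following code shows: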
from openpyxl.reader.excel import load_workbook

# use_iterators=False loads a regular, writable workbook, so sheets can be
# added to it afterwards.
book = load_workbook('t.xlsx', use_iterators=False)  # Assume t.xlsx contains ['Data', 'Feuil2', 'Feuil3']
# Single-argument print(...) prints the same text on Python 2 and Python 3.
print(book.get_sheet_names())


class temp_excel_writer():
    """Writer-like wrapper around an already loaded openpyxl workbook.

    Creating an instance adds a sheet titled 'Test' to ``book``, saves the
    workbook to ``path``, prints the resulting sheet names and stores the
    bookkeeping attributes set in ``__init__``.
    """

    def __init__(self, path, book):
        """Attach ``book``, add the 'Test' sheet and save everything to ``path``."""
        self.book = book
        # Only the side effect (a new sheet in the workbook) is needed, so the
        # returned worksheet object is not kept.
        self.book.create_sheet(title='Test')  # No exception here now
        self.book.save(path)
        self.use_xlsx = True
        self.sheet_names = self.book.get_sheet_names()
        print(self.sheet_names)
        self.actual_sheets = self.book.worksheets
        # sheet name -> (worksheet, 1)
        self.sheets = {
            name: (worksheet, 1)
            for name, worksheet in zip(self.sheet_names, self.actual_sheets)
        }
        self.cur_sheet = None
        self.path = path  # I had to modify this line also


my_temp_writer = temp_excel_writer('my_excel_file.xlsx', book)
It creates a file named my_excel_file.xlsx and prints the following output:
['Data', 'Feuil2', 'Feuil3']
['Data', 'Feuil2', 'Feuil3', 'Test']
Hope it helps

Related

How to return calling object from mocked pandas function

I am attempting to write a test (using pytest-mock) for some code that uses pandas for I/O. Ideally instead of writing the contents to a file using pandas.to_excel() function, I would rather return the dataframe to my test function for comparison.
So my questions are:
Is what I want even possible?
If so, how do I return the dataframe from the code under test to my test function?
Here's my code so far.
mymodule.py
def my_module_func(input_file: Path, dbc: DBConnection) -> None:
    """Read ``input_file`` with pandas and write it back as ``<stem>_Checked<suffix>``.

    The output file is placed in the same directory as ``input_file`` and is
    written without the index column.  ``dbc.get_from_db()`` is called before
    the spreadsheet is read.
    """
    db_data = dbc.get_from_db()
    spreadsheet_data = pandas.read_excel(input_file, engine='openpyxl')

    # Do some stuff to modify the spreadsheet_data dataframe

    # Build the sibling output path, then write the updated info to it
    checked_name = input_file.stem + "_Checked" + input_file.suffix
    destination = input_file.parent / checked_name
    spreadsheet_data.to_excel(destination, index=False)
test_mymodule.py
from pathlib import Path

import pytest
from pandas import DataFrame

from mymodule import my_module_func

dummy_data = {'values': ['val1']}


@pytest.fixture()
def fake_file():
    """Return the path handed to the code under test (read_excel is patched in the test)."""
    return Path("./mocked.xlsx")


def test_my_func(mocker, fake_file):
    """Run my_module_func with all I/O patched and capture the frame it would write."""
    # Patch the db class function so we don't actually hit the db
    mock_db_connector = mocker.patch('mymodule.DBConnection', autospec=True)
    mock_db_connector.get_from_db.return_value = ["val1", "val2"]
    # Patch the pandas read function for getting the data from the input file
    mocker.patch('mymodule.pandas.read_excel', return_value=DataFrame(data=dummy_data))
    # Patch the pandas to_excel function for writing data to file.
    # autospec=True keeps the real method signature, so the mock is called
    # with the DataFrame instance itself as first positional argument (self).
    output_patch = mocker.patch('mymodule.pandas.DataFrame.to_excel', autospec=True)
    my_module_func(fake_file, mock_db_connector)
    # The frame the code under test tried to write is the recorded `self`.
    output_patch.assert_called_once()
    actual = output_patch.call_args[0][0]
    assert isinstance(actual, DataFrame)
    # End Goal
    # pandas.testing.assert_frame_equal(actual, expected)

Modify Named Table in Excel File with Python openpyxl

I want to add columns to an existing table in an Excel file.
For this I want to use Python and the openpyxl library.
Right now I use a class that connects to the file when it is initialised.
Afterwards I call the check_for_column function, which should create the column if it does not exist yet. At the end of the script I save the file.
import os
from openpyxl import load_workbook
from openpyxl.worksheet.table import Table, TableColumn, range_boundaries
from openpyxl.utils.cell import get_column_letter
class ExcelHandler:
    """Holds an openpyxl workbook and the 'WebPageForms' table of its 'DataInbox' sheet."""

    # Class-level placeholders; each one is overwritten per instance in __init__.
    _wb_name = None
    _table = None
    _wb = None

    def __init__(self):
        """Load the workbook named by the EXCEL_FULLPATH environment variable."""
        self._wb_name = os.getenv('EXCEL_FULLPATH')
        self._wb = load_workbook(filename=self._wb_name, keep_vba=True)
        # The worksheet is only kept in this local variable; no attribute of
        # the instance refers to it afterwards.
        sheet = self._wb['DataInbox']
        self._table = sheet.tables['WebPageForms']
        return

    def check_for_column(self, column_name):
        """Append ``column_name`` to the table definition if it is not present yet.

        The table's cell range is widened by one column and a new TableColumn
        whose id is the last existing id plus one is appended.
        """
        # NOTE(review): only the table definition is changed here; no header
        # cell is written to the worksheet. The follow-up in this thread
        # reports that as the cause of Excel's repair prompt.
        if not column_name in self._table.column_names:
            lst_ids = [my_object.id for my_object in self._table.tableColumns]
            new_id = lst_ids[-1]+1
            # change range of table
            min_col, min_row, max_col, max_row = range_boundaries(
                self._table.ref)
            max_col += 1
            mx = get_column_letter(max_col)
            mn = get_column_letter(min_col)
            self._table.ref = '{}{}:{}{}'.format(mn, min_row, mx, max_row)
            # add column to table
            tc = TableColumn(id=new_id, name=column_name)
            self._table.tableColumns.append(tc)
        return

    def save_wb(self):
        """Save the workbook back to the path it was loaded from."""
        self._wb.save(self._wb_name)
        return
The code runs fine as shown. Although when I then try to open the file with excel it gives me an alert saying:
We found a problem with some content in ’file.xlsm’. Do you want us to try to recover as much as we can? If you trust the source of this workbook, click Yes.
This is the repair result of excel when I press yes
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<recoveryLog xmlns="http://schemas.openxmlformats.org/spreadsheetml/2006/main"><logFileName>Repair Result to file.xml</logFileName><summary>Errors were detected in file ’*path*/file.xlsm’</summary><repairedRecords summary="Following is a list of repairs:"><repairedRecord>Repaired Records: Table from /xl/tables/table1.xml part (Table)</repairedRecord></repairedRecords></recoveryLog>
I would highly appreciate If anyone could help me
Ok, I found the problem why the excel file is corrupt, my bad.
when I create the column in the table, I also have to write the name in the respective cell:
def check_for_column(self, column_name):
***
# write name in cell of new column header
self._ws.cell(row=min_row, column=max_col).value = column_name
***
return
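If I add this to the code (and keep a reference to the worksheet as self._ws in __init__, since the original class only holds it in a local variable), my table is modified correctly.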

Hand over settings in a class to a pd.read_csv() function

Hi, I am pretty new to Python. I developed the following class:
import pandas as pd
import os
class Class1:
    """Load a CSV file into ``self.raw`` using overridable ``pd.read_csv`` options.

    Only the options that differ from ``_DEFAULT_SETTINGS`` need to be given,
    either as a (possibly partial) ``settings`` dict or as keyword arguments.
    Keyword arguments win over ``settings``, which win over the defaults.
    Any ``pd.read_csv`` parameter may be supplied this way.
    """

    # Baseline read_csv options. Kept on the class and copied on every use so
    # that no mutable dict is shared between calls as a default argument.
    _DEFAULT_SETTINGS = {
        "sep": ";",
        "encoding": "unicode_escape",
        "header": "infer",
        "decimal": ".",
        "skiprows": None,
        "names": None,
        "skipfooter": 0,
        "engine": "python",
    }

    def __init__(self, path, cols=None, settings=None, **read_csv_kwargs):
        """Read ``path`` and store the resulting DataFrame in ``self.raw``.

        Args:
            path: File path or file-like object accepted by ``pd.read_csv``.
            cols: Optional column selection, forwarded as ``usecols``.
            settings: Optional dict of ``pd.read_csv`` options; may be partial.
            **read_csv_kwargs: Further ``pd.read_csv`` options, given directly.
        """
        merged = dict(self._DEFAULT_SETTINGS)
        if settings:
            merged.update(settings)
        merged.update(read_csv_kwargs)
        self.raw = self._load_raw(path=path, s=merged, cols=cols)

    def _load_raw(self, path, s, cols=None):
        """Return the DataFrame read from ``path`` with options ``s``.

        Keys missing from ``s`` fall back to ``_DEFAULT_SETTINGS``.
        """
        options = dict(self._DEFAULT_SETTINGS)
        options.update(s)
        # An explicit ``cols`` takes precedence over a "usecols" entry in ``s``;
        # writing it into the dict avoids passing the keyword twice.
        if cols is not None:
            options["usecols"] = cols
        return pd.read_csv(path, **options)
Inside the class is a function which reads a CSV file into a pd.DataFrame. I am wondering if there is a smarter way to design the class than handing over such a complete settings dictionary every time an object is created. Suppose one CSV file is much simpler and needs just one argument, e.g. "sep", and none of the others, while the class must also be able to read CSV files which require more arguments. Is there a pythonic way to hand over only as many settings as are required?
For example, one object needs just "sep", and another object needs all of the parameters defined in the docs of pd.read_csv, but both can be created with the same class.

Class that returns a transformed dataframe

I'm trying to create a class that takes the path and name of the CSV file, converts it to a dataframe, deletes some columns, converts another one to datetime, as in the code
import os
from pathlib import Path
import pandas as pd
import datetime
class Plans:
    """Question snippet: load a ';'-separated CSV into ``self.df`` and drop unused columns."""

    def __init__(self, file , path):
        """Store the file name and directory and create a placeholder frame."""
        self.file = file
        self.path = path
        # NOTE(review): the reported error is "module 'pandas' has no
        # attribute ..."; the answer in this thread points out that the
        # class is spelled pd.DataFrame.
        self.df = pd.Dataframe()

    def get_dataframe(self):
        """Read the CSV, convert the 'data' column to datetime and drop unused columns."""
        # Changes the working directory of the whole process to self.path.
        os.chdir(self.path)
        self.df = pd.read_csv(self.file, encoding="latin-1", low_memory=False, sep=';')
        # NOTE(review): bare `df` is never assigned in this method; only
        # self.df is. Same for df['data'] two lines below.
        if 'data' in df.columns:
            self.tipo = 'sales'
            self.df['data'] = pd.to_datetime(df['data'])
        # NOTE(review): called as a bare name although it is defined as a
        # method of this class. (Indentation is reconstructed; this return is
        # assumed to sit outside the `if`.)
        return clean_unused_data()

    def clean_unused_data(self):
        """Delete a fixed list of columns from ``self.df`` and return the frame."""
        columns = ['id', 'docs', 'sequence','data_in','received', 'banc', 'return', 'status', 'return_cod',
                   'bank_account_return', 'id_transcript', 'id_tx','type_order']
        # Each column is deleted unconditionally, with no check that it exists.
        for item in columns:
            del self.df[item]
        # Empties the local list; it is not used again afterwards.
        del columns[:]
        return self.df
When I call an object of the class it gives an error with the clean_unused_data function
returns the following error:
__getattr__ raise AttributeError(f"module 'pandas' has no attribute '{name}'")
Also, I would like to do more dataframe transformations in the Plans class. but since this first one failed, I was a little lost.
Thanks for the help and I apologize for the lack of intimacy with python
I think the error refers to calling an attribute that does not exist in Pandas. From what I can see you wrote pd.DataFrame as pd.Dataframe. Notice the capitalization.
Try the following:
def __init__(self, file , path):
    """Store the CSV file name and directory and start with an empty DataFrame."""
    self.file = file
    self.path = path
    # Note the capital "F": the pandas class is DataFrame.
    self.df = pd.DataFrame()
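Once that is fixed, two more names in get_dataframe need attention: the frame must be referenced as self.df (not the bare name df), and the helper must be called as self.clean_unused_data(). Finally, one of the columns you are trying to delete is probably not actually in your file. You can handle the exception or remove that column label from your list.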

nrows function in xlrd not returning the recent row values

I am a beginner learning Python. For one of my validations I need to return the number of rows written so far, each time a row is written. I have written the code below, but the nrows attribute does not reflect the most recently written rows.
import xlwt
import xlrd
# Create a workbook with a single sheet and save it to disk before any row is written.
Task1= xlwt.Workbook()
sheet1 = Task1.add_sheet("Task_details1")
Task1.save('C:\Users\HOME\Desktop\Task1.xls')
# NOTE(review): the saved file is opened for reading only once, here, before
# the loop below writes anything; return_rows() keeps using this `book`.
book = xlrd.open_workbook('C:\Users\HOME\Desktop\Task1.xls')
sheet = book.sheet_by_name("Task_details1")
def return_rows():
    # Row count of the first sheet of the module-level `book` opened above.
    return book.sheet_by_index(0).nrows
class test(object):
    # Writes 20 values into column 0, saving the file and printing
    # return_rows() after each write.
    def __init__(self):
        for i in range(20):
            sheet1.write(i,0,i)
            Task1.save('C:\Users\HOME\Desktop\Task1.xls')
            print return_rows()
test1=test()
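Function definition version 1 (re-opens the workbook inside the function, so every call reads the file as it was last saved instead of reusing the workbook object that was opened before the rows were written):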
def return_rows():
    """Return the number of rows in the first sheet of the saved workbook.

    The workbook is opened inside the function, so every call reads the file
    as it currently exists on disk.
    """
    # Raw string: the path has the same value as before on Python 2, and the
    # "\U" in it is no longer parsed as a unicode escape on Python 3.
    book = xlrd.open_workbook(r'C:\Users\HOME\Desktop\Task1.xls')
    return book.sheet_by_index(0).nrows

Categories