excel pickling with openpyxl - python

I have written this code, but on the line _ = ws.cell(column == get_column_letter(cll[0][0]), row == (cll[0][1]), value == cll[1]) it returns an error that I do not understand!
Here's the code:
from openpyxl import Workbook as Wb
from openpyxl.utils import get_column_letter
import dill
def unpickle_exl(file_name, unpkld_file_name):
    """
    Unpickle a dill-pickled spreadsheet dump and write it out as an .xlsx file.

    Args:
        file_name: name of the pickled file; must end with ``.dll``.
        unpkld_file_name: name of the workbook to create; must end with
            ``.xlsx``. Pass ``''`` to reuse ``file_name`` with its ``.dll``
            suffix swapped for ``.xlsx``.

    Raises:
        SyntaxError: if either name carries the wrong suffix.
        ReferenceError: if ``file_name`` cannot be opened.
    """
    if not file_name.endswith('.dll'):
        raise SyntaxError("file_name does not end with suffix .dll")
    if unpkld_file_name != '' and not unpkld_file_name.endswith('.xlsx'):
        raise SyntaxError("unpkld_file_name does not end with suffix .xlsx")
    if unpkld_file_name == '':
        # Assignment (=), not comparison (==); derive the name from the input
        # file, because the empty string has no ".dll" to replace.
        unpkld_file_name = file_name[:-len('.dll')] + '.xlsx'
    try:
        with open(file_name, 'rb') as d:
            # NOTE(review): dill.load can execute arbitrary code; only load
            # files that come from a trusted source.
            pkld_sprdsht = dill.load(d)
    except OSError as err:
        raise ReferenceError("File " + str(file_name) + " does not exist.") from err
    print(pkld_sprdsht)
    wb = Wb()
    for obj in pkld_sprdsht:
        ws = wb.create_sheet()
        for sht in obj:
            for cll in sht:
                # Keyword arguments are written with "="; "==" compares against
                # names that do not exist. ws.cell() takes a numeric column
                # index, so the stored number is passed as-is.
                # NOTE(review): assumes cll is ((column_number, row_number), value) -- confirm against the pickling code.
                ws.cell(column=cll[0][0], row=cll[0][1], value=cll[1])
    wb.save(filename=unpkld_file_name)
def test():
    """Run unpickle_exl on the sample pickled workbook and write a copy."""
    pickled_name = 'xlsx_to_dll test sprdsht.dll'
    workbook_name = 'xlsx_to_dll test sprdsht_copy.xlsx'
    unpickle_exl(pickled_name, workbook_name)
I'm trying to save the cells one by one(or maybe everything at once) but I don't really understand the _ = ... bit.
NOTE:
If you need it, I can add the pickling code if it helps.

Related

How to insert a data frame as an object attribute

This is most likely a pretty basic question, but I am still learning about classes/objects/constructors/etc. and I am trying to apply some of these concepts to my current workflow.
I am trying to create a class that automatically saves my data frame as a CSV or xlsx file, depending on what I specify, to a given folder. However, I don't believe that I am correctly passing my data frame as an object attribute. This is my code as it stands:
# The data frame to upload: the rows of merged_df whose 'award_date_change' column equals 'yes'.
award_date_change = merged_df.loc[merged_df['award_date_change'] == 'yes']
class uploading_to_GC:
    """Save a data frame to the private GCdocs folder as a CSV or xlsx file.

    Despite its name, ``file_name`` holds the data frame itself; the name of
    the file on disk is given separately through ``output_name``.
    """

    def __init__(self, file_name, file_type, already_exists, output_name=None):
        """
        Args:
            file_name: the pandas DataFrame to save.
            file_type: ``"csv"`` or ``"xlsx"``.
            already_exists: True to append to an existing file, False to
                create a new one.
            output_name: file name (with extension) inside the GCdocs folder.
        """
        self.file_name = file_name
        self.file_type = file_type
        self.already_exists = already_exists
        self.output_name = output_name

    def print_file_name(self):
        """Print the first five rows of the stored data frame."""
        print(self.file_name.head(5))

    def private_workspace(self):
        """Write the data frame to the GCdocs folder, appending if the file exists.

        Raises:
            ValueError: if there is data to write but no ``output_name`` was given.
        """
        if len(self.file_name) == 0:
            print("there is no data to upload")
            return
        if self.file_type not in ("csv", "xlsx"):
            print("unrecognized file type")
            return
        if self.output_name is None:
            # The data frame carries no file name, so the caller must supply one.
            raise ValueError("output_name is required to build the destination path")

        commonPath = os.path.expanduser(r"~\path")
        GCdocs = commonPath + '384593683' + '\\'
        path = GCdocs + self.output_name

        if self.file_type == "csv":
            if self.already_exists:  # a file already exists in the folder
                GC_old = pd.read_csv(path)
                # DataFrame.append was removed in pandas 2.0; concat is the replacement.
                GC_new = pd.concat([GC_old, self.file_name], ignore_index=True)
                GC_new.to_csv(path, index=False)
                print("csv file is updated to private workspace in GCdocs")
            else:  # the file does not exist in the folder yet
                self.file_name.to_csv(path, index=False)
        else:
            if self.already_exists:
                # An xlsx file has to be read with read_excel, not read_csv.
                GC_old = pd.read_excel(path)
                GC_new = pd.concat([GC_old, self.file_name], ignore_index=True)
                GC_new.to_excel(path, index=False)
                print("excel file is updated to private workspace in GCdocs")
            else:
                self.file_name.to_excel(path, index=False)
award_date_change = uploading_to_GC(award_date_change, "csv", False)
# The parentheses are required: without them the method is only looked up, never run.
award_date_change.private_workspace()
I am aware that I don't need to use a class to do this, but I wanted to challenge myself to start using classes more often. Any help would be appreciated
You can pass and store a df in a Class as a data member very simply:
class Foo:
    """Minimal example of keeping a DataFrame as an instance attribute."""

    def __init__(self, df: pd.DataFrame):
        # `self` must be the first parameter of every instance method.
        # A copy is stored so that changes made through self.df never touch
        # the caller's frame; use `self.df = df` instead to share the object.
        self.df = df.copy()


df = pd.DataFrame()
foo_obj = Foo(df)
Edit: the : pd.DataFrame is for type-hinting. This does not affect the actual code, but is merely useful to the reader that we are expecting a pd.DataFrame as input. Good IDEs will also give you an error if you don't pass a DataFrame.

with statement python __enter__ attribute error

This :
def add_to_excel(list_to_save, file_to_save_in):
    """Append every item of list_to_save as a row to 'Sheet1' of a workbook in dir_path."""
    my_file = dir_path + '\\' + file_to_save_in
    # NOTE(review): this `with` line is the one that raises the
    # "AttributeError: __enter__" quoted in the traceback that follows.
    with openpyxl.load_workbook(filename=my_file) as links_excel:
        sheet = links_excel['Sheet1']
        for i in list_to_save:
            sheet.append(i)
        # NOTE(review): `filename` is not defined anywhere in this function;
        # `my_file` was presumably intended.
        links_excel.save(filename)
    return
returns this:
3 my_file = dir_path + '\\' + file_to_save_in
----> 4 with openpyxl.load_workbook(filename=my_file) as links_excel:
5 sheet = links_excel['Sheet1']
6 for i in list_to_save:
AttributeError: __enter__
Tried this:
You're not using with statement and there's no close() statement so if this is not the first time you're running the code, it's likely that you haven't closed the file properly and it is still sitting in the memory and prevents access.
Edit:
Apparently closing the excel fixes it, and the with statement is not needed.
links_excel.close()
def add_to_excel(list_to_save, file_to_save_in):
    """Append each entry of list_to_save as a row to 'Sheet1' and save the workbook in place."""
    workbook_path = os.path.join(dir_path, file_to_save_in)
    workbook = openpyxl.load_workbook(filename=workbook_path)
    target_sheet = workbook['Sheet1']
    for row_values in list_to_save:
        target_sheet.append(row_values)
    workbook.save(workbook_path)
    workbook.close()
from openpyxl documentation
Read an existing workbook:
from openpyxl import load_workbook
# Open an existing workbook from disk with a plain assignment.
wb = load_workbook(filename = 'empty_book.xlsx')
# Look up the worksheet titled 'range names'.
sheet_ranges = wb['range names']
# Print the value stored in cell D18 of that worksheet.
print(sheet_ranges['D18'].value)
This is an example on how to use the load_workbook method, so you don't need to use that with statement. Just use assignment.
def add_to_excel(list_to_save, file_to_save_in):
    """Append each item of list_to_save as a row to 'Sheet1' and save the workbook.

    Args:
        list_to_save: iterable of rows; each row is passed to ``sheet.append``.
        file_to_save_in: workbook file name inside ``dir_path``.
    """
    my_file = dir_path + '\\' + file_to_save_in
    links_excel = openpyxl.load_workbook(filename=my_file)
    try:
        sheet = links_excel['Sheet1']
        for i in list_to_save:
            sheet.append(i)
        # Save to the path that was opened; `my_file` is the only path this
        # function defines.
        links_excel.save(my_file)
    finally:
        # Close the workbook even when appending or saving fails.
        links_excel.close()

Python - match string to csv value, then extract adjacent column

I'm very green when it comes to Python, so please forgive my disgusting formatting or poor optimization.
I'm trying to write a script to sort files into new folders based on their name.
In order to match their name to the correct new location, I have a csv file with two columns; the first is part of the name of the file, and the second is the correct folder it belongs in.
So far I have everything written to extract the parts of the file names I need, but now I'm stuck as to how I can match the strings I have to a value in the csv, and then extract the adjacent column.
This is what I have so far:
import os
import csv
def openCSV(csvFile):
    """Read a CSV file that has a header row and return its rows as a list of dicts.

    The file is opened in a ``with`` block so the handle is closed once the
    rows have been read; ``newline=''`` is what the csv module expects.
    """
    with open(csvFile, newline='') as csv_handle:
        return list(csv.DictReader(csv_handle))
def findDemoName(fileName):
    """Return fileName without its first 16 and last 11 characters."""
    return fileName[16:-11]
def moveFiles(sortingFile, sourceDirectory, destinationDirectory):
    """Load the sorting CSV, then print the extracted name of every entry in sourceDirectory."""
    sortingCSV = openCSV(sortingFile)
    for entry in os.listdir(sourceDirectory):
        print(findDemoName(entry))
# begin program
if __name__ == "__main__":
    # set the CSV used to sort the files
    fileToSortFrom = '<csv used for sorting>'
    # directory whose entries are listed
    inputDirectory = '<where the files are located>'
    # destination directory handed to moveFiles
    outputDirectory = '<where I want to move the files>'
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory)
Right now it just prints the extracted portion of the file name, so I can make sure it is doing what I want.
So my next steps are
1. Match the extracted portion of the file name to a matching value in the first column of the csv
2. Take the value adjacent to the match and use it to complete the destination path for the file to be moved to
I found this thread match names in csv file to filename in folder, but I don't understand where in the answer the csv is being matched to.
If I need to clear up some points let me know and I will.
Thank you in advance for reading :)
EDIT:
I've tried to stumble my way through this, and here's what I have so far:
import os, shutil
import csv
def openCSV(csvFile):
    """Read a CSV file that has a header row and return its rows as a list of dicts.

    The file is opened in a ``with`` block so the handle is closed once the
    rows have been read; ``newline=''`` is what the csv module expects.
    """
    with open(csvFile, newline='') as csv_handle:
        return list(csv.DictReader(csv_handle))
"""def createReader(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
return reader"""
def extractDemoName(fileName):
    """Return fileName without its first 16 and last 11 characters."""
    return fileName[16:-11]
def moveFiles(sortingFile, sourceDirectory, destinationDirectory, prefix, suffix):
    """For each entry in sourceDirectory, look its extracted name up in the CSV rows and copy the matching file."""
    # Rows of the sorting CSV, one mapping per row keyed by the header names.
    reader = openCSV(sortingFile)
    #reader = createReader(sortingFile)
    srcDir = sourceDirectory
    destDir = destinationDirectory
    column1 = 'DemographicName'
    column2 = 'DemographicTypeName'
    folder = ''
    for filename in os.listdir(srcDir):
        name = extractDemoName(filename)
        for row in reader:
            # NOTE(review): row(column1) calls the row object, which raises the
            # TypeError quoted in the traceback that follows; a row is indexed
            # with square brackets, row[column1].
            if row(column1) == name:
                folder = row(column2)
                # NOTE(review): plain string concatenation -- no path separator
                # is inserted between destDir and folder.
                destination = destDir + folder
                # NOTE(review): the rebuilt name is not joined with srcDir, so
                # it is resolved relative to the current working directory.
                file = prefix + name + suffix
                # NOTE(review): shutil.copy copies the file although the
                # message below says 'Moved'.
                shutil.copy(file, destination)
                print('Moved ' + file + ' to ' + destination)
            #else reader.next()
        print(name)
# begin program
if __name__ == "__main__":
    # set the CSV used to sort the files
    fileToSortFrom = '<csv file>'
    # directory whose entries are listed
    inputDirectory = '<source path>'
    # directory prefix that the matched folder name is appended to
    outputDirectory = '<destination path>'
    # text placed before and after the extracted name to rebuild a file name
    filePrefix = '<beginning text of files>'
    fileSuffix = '<ending text of files>'
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
But now I'm receiving the following error instead:
Traceback (most recent call last):
File "script.py", line 63, in <module>
moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
File "script.py", line 38, in moveFiles
if row(column1) == name:
TypeError: 'collections.OrderedDict' object is not callable
There is the problem (line 38)
if row(column1) == name:
it should be
if row[column1] == name:
I haven't checked any other logic in the script :)
This script reads the files in the directory you pass to move_files as from_dir.
It checks if the file in the from_dir exists in the csv_file and if it does, it gets the location and moves it to that directory.
import os
import csv
import shutil
def get_file_sorter_dict(csv_file):
    """Return a dict mapping the first CSV column (file name) to the second (location).

    Every non-blank row must have exactly two fields. Blank lines are
    skipped, and the file handle is closed once the rows are read.
    """
    with open(csv_file, newline='') as csv_handle:
        return dict(row for row in csv.reader(csv_handle) if row)
def move_files(csv_file, from_dir, to_dir):
    """Move every file in from_dir that is listed in csv_file into to_dir.

    Args:
        csv_file: two-column CSV of file name and location.
        from_dir: directory whose entries are checked against the CSV.
        to_dir: directory the matching files are moved to.
    """
    file_sorter_dict = get_file_sorter_dict(csv_file)
    for filename in os.listdir(from_dir):
        if filename in file_sorter_dict:
            # os.listdir returns bare names, so the source path has to be
            # rebuilt; otherwise the move only works when from_dir is the
            # current working directory.
            source_path = os.path.join(from_dir, filename)
            # To use the location stored in the CSV instead of to_dir:
            #     shutil.move(source_path, file_sorter_dict[filename])
            shutil.move(source_path, to_dir)
if __name__ == "__main__":
    # Check the entries of the current directory against files_sorter.csv,
    # with the parent directory as the move target.
    move_files('files_sorter.csv', '.', '../')
The csv I am using looks like:
name, location
"foo.txt","../"
"baz.txt","../"

iterate over multiple files in my directory

Currently I am grabbing an Excel file from a folder with Python just fine, using the code below, and pushing it to a web form via Selenium.
However, I am trying to modify this to continue to go through a directory over multiple files. (there will be many excel files in my 'directory' or 'folder').
main.py
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry
if __name__ == "__main__":
    # Entry point: load the pending records and the mapping sheet, then hand
    # both to VitalEntry for processing.
    try:
        #Instantiates FindPendingRecords then gets records to process
        PENDING_RECORDS = FindPendingRecords().get_excel_data()
        #Reads excel to map data from excel to vital
        # (a second FindPendingRecords instance is created for this call)
        MAP_DATA = FindPendingRecords().get_mapping_data()
        #Configures Driver for vital
        VITAL_ENTRY = VitalEntry()
        #Start chrome and navigate to vital website
        VITAL_ENTRY.instantiate_chrome()
        #Begin processing Records
        VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
        print("All done, Bill")
    except Exception as exc:
        # Any failure above is reduced to its message; no traceback is printed.
        print(exc)
config.py
# Zip archive that find_file() searches for the data-log workbook.
FILE_LOCATION = r"C:\Zip\2019.02.12 Data Docs.zip"
# Directory the matching workbook is extracted into.
UNZIP_LOCATION = r"C:\Zip\Pending"
# NOTE(review): presumably the address of the target web application -- not referenced in the code shown.
VITAL_URL = 'http://boringdatabasewebsite:8080/Horrible'
# NOTE(review): presumably toggles headless browser mode -- confirm against VitalEntry.
HEADLESS = False
# NOTE(review): meaning not visible in the code shown -- confirm.
PROCESSORS = 4
# Workbook read by get_mapping_data() (sheet 'main').
MAPPING_DOC = ".//map/mapping.xlsx"
find_pending_records.py
"""Module used to find records that need to be inserted into Horrible website"""
from zipfile import ZipFile
import math
import pandas
import config
class FindPendingRecords:
    """Class used to find records that need to be inserted into Site"""

    @classmethod
    def find_file(cls):
        """Finds the excel file to process.

        Extracts the first archive member whose name contains
        'Horrible Data Log ' into config.UNZIP_LOCATION.

        Returns:
            Path of the extracted file.

        Raises:
            FileNotFoundError: if no archive member matches.
        """
        with ZipFile(config.FILE_LOCATION) as archive:
            for member in archive.filelist:
                if 'Horrible Data Log ' in member.filename:
                    return archive.extract(member.filename, config.UNZIP_LOCATION)
        # Raise instead of returning the exception class, so the caller never
        # passes a non-path on to pandas.read_excel.
        raise FileNotFoundError(
            f"no 'Horrible Data Log ' file found in {config.FILE_LOCATION}")

    def get_excel_data(self):
        """Places excel data into pandas dataframe"""
        excel_data = pandas.read_excel(self.find_file())
        columns = pandas.DataFrame(columns=excel_data.columns.tolist())
        excel_data = pandas.concat([excel_data, columns])
        # Normalise the headers: trim them, then turn "/" and " " into "_".
        excel_data.columns = excel_data.columns.str.strip()
        excel_data.columns = excel_data.columns.str.replace("/", "_")
        excel_data.columns = excel_data.columns.str.replace(" ", "_")
        excel_data = self._drop_invalid_records(excel_data)
        return self.clean_data_frame(excel_data)

    def _drop_invalid_records(self, excel_data):
        """Return excel_data without the rows whose PERSON value is missing or blank."""
        invalid_labels = []
        num_valid_records = 0
        for row in excel_data.itertuples():
            person = row.PERSON
            # pandas.isna covers None and NaN without raising on strings.
            if pandas.isna(person) or person in ("", " "):
                print(f"Invalid record: {row}")
                invalid_labels.append(row.Index)
            else:
                num_valid_records += 1
        print(f"Processing #{num_valid_records} records")
        # Drop once, by index label, after the loop: dropping by position
        # inside the loop shifts the positions and removes the wrong rows.
        return excel_data.drop(index=invalid_labels)

    def clean_data_frame(self, data_frame):
        """Cleans up dataframes"""
        for col in data_frame.columns:
            if "date" in col.lower():
                # Unparseable values become NaT (errors='coerce').
                # NOTE(review): infer_datetime_format is deprecated since pandas 2.0.
                data_frame[col] = pandas.to_datetime(data_frame[col],
                                                     errors='coerce', infer_datetime_format=True)
                data_frame[col] = data_frame[col].dt.date
        # PERSON goes through int first so that 12.0 becomes "12", not "12.0".
        data_frame['PERSON'] = data_frame['PERSON'].astype(int).astype(str)
        return data_frame

    def get_mapping_data(self):
        """Read the 'main' sheet of the mapping workbook into a dataframe."""
        map_data = pandas.read_excel(config.MAPPING_DOC, sheet_name='main')
        columns = pandas.DataFrame(columns=map_data.columns.tolist())
        return pandas.concat([map_data, columns])
One way is as below (pseudocode)
class FindPendingRecords:
    """Sketch: collect every file up front, then yield them one at a time."""

    @classmethod
    def find_file(cls):
        """Return the list of files to process (placeholder names)."""
        return ["file1", "file2", "file3"]

    def __init__(self):
        self.files = self.find_file()

    def get_excel_data(self):
        """Yield one item per file so the caller can loop over all of them."""
        for excel_data in self.files:
            # process your excel_data
            yield excel_data
Your main should be
if __name__ == "__main__":
    try:
        # get_excel_data() yields one item per file, so the loop body runs
        # once for every file that was found.
        for PENDING_RECORDS in FindPendingRecords().get_excel_data():
            # Do operations on PENDING_RECORDS
            print (PENDING_RECORDS)
        print("All done, Bill")
    except Exception as exc:
        # Any failure above is reduced to its message; no traceback is printed.
        print(exc)
Your find_file method will be
@classmethod
def find_file(cls):
    """Finds the excel files to process.

    Extracts every archive member whose name contains 'Horrible Data Log '
    into config.UNZIP_LOCATION.

    Returns:
        List of extracted file paths; empty when nothing matches.
    """
    with ZipFile(config.FILE_LOCATION) as archive:
        return [
            archive.extract(member.filename, config.UNZIP_LOCATION)
            for member in archive.filelist
            if 'Horrible Data Log ' in member.filename
        ]

Merge two excel files with multiple sheet without losing formatting

I want to merge multiple excel files with multiple sheets respectively using python. I do not want to lose any formatting from the sheets. It should copy all sheets and just create a single excel file.
I'm able to merge only the first sheet and also all formatting is lost.
This is my code:
import os
import os.path
import xlrd
import xlsxwriter
# Copies the cell values of the first sheet of every workbook found under
# C:\test into a single worksheet of m.xls.
# NOTE(review): the argument of input() is the prompt shown to the user, not a default value.
file_name = input("merge")
# NOTE(review): merged_file_name is never used; the output name below is hard-coded.
merged_file_name = file_name + ".xls"
dest_book = xlsxwriter.Workbook('m.xls')
dest_sheet_1 = dest_book.add_worksheet()
# Next free row in the destination sheet; row 0 is reserved for the header.
dest_row = 1
# Flag: 0 until the header row of the first workbook has been copied.
temp = 0
path = input("C:\\test")
# NOTE(review): this tests the empty string rather than `path`.
out = os.path.isdir("")
print(out)
print("File path: " + path)
# NOTE(review): the walk uses a hard-coded folder and ignores `path`.
for root,dirs, files in os.walk("C:\\test"):
    for xlsfile in files:
        print ("File in mentioned folder is: " + xlsfile)
        temp_book = xlrd.open_workbook(os.path.join(root,xlsfile))
        # Only sheet index 0 is read, which is why just the first sheet is merged.
        temp_sheet = temp_book.sheet_by_index(0)
        if temp == 0:
            # Copy the header row once, from the first workbook only.
            for col_index in range(temp_sheet.ncols):
                # NOTE(review): `str` shadows the builtin of the same name.
                str = temp_sheet.cell_value(0, col_index)
                dest_sheet_1.write(0, col_index, str)
            temp = temp + 1
        # Copy the data rows; only cell values are written, no formatting.
        for row_index in range(1, temp_sheet.nrows):
            for col_index in range(temp_sheet.ncols):
                str = temp_sheet.cell_value(row_index, col_index)
                dest_sheet_1.write(dest_row, col_index, str)
            dest_row = dest_row + 1
dest_book.close()
book = xlrd.open_workbook("m.xls")
sheet = book.sheet_by_index(0)
# NOTE(review): under Python 3 each line below prints only the label and
# builds an unused tuple; the count itself is not printed.
print ("number of rows in destination file are: "), sheet.nrows
print ("number of columns in destination file are: "), sheet.ncols
Since you require Excel specific needs like formatting, consider directly interfacing to the Excel object library with a COM interface. Of course this assumes you have Excel installed on machine. For Windows, Python can run COM with the win32com library and this connects beyond Excel but to most Windows apps and objects including Notepad, Paint, even ADODB.
Essentially, this mirrors VBA (which uses a similar interface to the Excel object library) using Workbooks.Add, Sheets.Add, Range.Copy, and other methods. Other APIs such as xlrd and xlsxwriter do not use Excel's own methods, which is why you lose formatting and even graphics, but not data.
import os
import win32com.client as win32
# Copies every worksheet of every .xls/.xlsx file in `path` into a new
# workbook through the Excel COM interface.
# NOTE(review): the argument of input() is the prompt shown to the user, not a default value.
path = input("C:\\test")
file_name = input("merge")
# NOTE(review): merged_file_name is never used; the SaveAs name below is hard-coded.
merged_file_name = file_name + ".xlsx"
try:
    # INITIALIZE EXCEL COM APP
    xlapp = win32.gencache.EnsureDispatch('Excel.Application')
    # ASSIGN CONSTANTS
    # NOTE(review): only xlWorkbookDefault is used below; `lPasteFormats` looks like a typo for xlPasteFormats.
    xlPasteValues = -4163; lPasteFormats = -4122; xlWorkbookDefault = 51
    # CREATE NEW WOKRBOOK (PROMPTS IF EXISTS)
    new_wb = xlapp.Workbooks.Add()
    new_wb.SaveAs(Filename='MasterMerge.xlsx', FileFormat=xlWorkbookDefault)
    # LOOP THROUGH WORKBOOKS
    xl_files = [f for f in os.listdir(path) if f.endswith('.xls') or f.endswith('.xlsx')]
    for wb in xl_files:
        xlwb = xlapp.Workbooks.Open(os.path.join(path, wb))
        # LOOP THROUGH EVERY WORKSHEET, COPYING TO NEW WORKSHEET
        for xlsh in xlwb.Worksheets:
            new_sh = new_wb.Worksheets.Add()
            # NOTE(review): presumably fails when two source sheets share a name -- confirm.
            new_sh.Name = xlsh.Name
            new_wb.Save()
            # Move the new sheet behind the current last sheet.
            new_sh.Move(After=new_wb.Worksheets(new_wb.Worksheets.Count))
            xlsh.Cells.Copy(new_sh.Cells)
            new_sh = None
        # Close the source workbook; False is passed as the first argument.
        xlwb.Close(False)
        xlwb = None
    # REMOVNIG DEFAULT SHEET AND LAUNCHING TO SCREEN
    new_wb.Worksheets('Sheet1').Delete()
    new_wb.Save()
    xlapp.Visible = True
except Exception as e:
    # Any failure above is reduced to its message.
    print(e)
finally:
    # RELEASE RESOURCES
    # NOTE(review): the references are dropped but xlapp.Quit() is never called.
    xlsh = None; new_sh = None;
    xlwb = None; new_wb = None; xlapp = None
Regarding the error faced by Sachin Ingle,
(-2147352567, 'Exception occurred.', (0, 'Microsoft Excel', 'That name is already taken. Try a different one.', 'xlmain11.chm', 0, -2146827284), None) facing this error
It's probably because you have created a file with the same name before. Try creating it with a different name.
The answer by Parfait on
new_wb.SaveAs(Filename='MasterMerge.xlsx', FileFormat=xlWorkbookDefault)
will make the file named "MasterMerge.xlsx" and probably you have created the file already.
By the way, you can add xlapp.Quit() to the finally: block to solve the file-in-use problem.
I made some changes to Parfait's answer (thanks, mate):
def merge_excel_files(filepath_list, filename, delete_original_files=False):
    """Merge every worksheet of the given workbooks into one new workbook via Excel COM.

    Args:
        filepath_list: paths of the workbooks to merge.
        filename: path the merged workbook is saved as.
        delete_original_files: when True, remove the source workbooks, but
            only after the merge completed without an error.

    Errors raised while merging are printed, not propagated.
    """
    import errno
    import os
    import win32com.client as win32

    # Bound before the try so the finally block can tell whether Excel started.
    xlapp = None
    merged = False
    try:
        # INITIALIZE EXCEL COM APP
        xlapp = win32.gencache.EnsureDispatch('Excel.Application')
        # File format constant passed to SaveAs
        xlWorkbookDefault = 51
        # CREATE NEW WORKBOOK (PROMPTS IF EXISTS)
        new_wb = xlapp.Workbooks.Add()
        new_wb.SaveAs(Filename=filename, FileFormat=xlWorkbookDefault)
        for wb in filepath_list:
            xlwb = xlapp.Workbooks.Open(wb)
            # LOOP THROUGH EVERY WORKSHEET, COPYING TO NEW WORKSHEET
            for xlsh in xlwb.Worksheets:
                new_sh = new_wb.Worksheets.Add()
                new_sh.Name = xlsh.Name
                new_wb.Save()
                # Move the new sheet behind the current last sheet.
                new_sh.Move(After=new_wb.Worksheets(new_wb.Worksheets.Count))
                xlsh.Cells.Copy(new_sh.Cells)
                new_sh = None
            xlwb.Close(False)
            xlwb = None
        # REMOVING DEFAULT SHEET
        new_wb.Worksheets('Sheet1').Delete()
        new_wb.Save()
        merged = True
    except Exception as e:
        print(e)
    finally:
        # Close Excel since writing is done; skipped when Excel never started.
        if xlapp is not None:
            xlapp.Quit()
        # RELEASE RESOURCES
        xlsh = new_sh = xlwb = new_wb = xlapp = None

    # Delete the source files only after a successful merge; deleting them
    # after a failure would lose the data.
    if delete_original_files and merged:
        for count, x in enumerate(filepath_list, start=1):
            print(f"Deleting the {count}/{len(filepath_list)} original file(s)...")
            try:
                os.remove(x)
            except OSError as e:
                # A file that is already gone is fine; anything else is re-raised.
                if e.errno != errno.ENOENT:
                    raise
            else:
                # If there's no exception
                print(f"Deleted {x}")
## Merge Excel files into one workbook with keeping the sheets and styling/formatting
# => https://stackoverflow.com/questions/51986517/merge-two-excel-files-with-multiple-sheet-without-losing-formatting
# => https://stackoverflow.com/questions/44593705/how-to-copy-over-an-excel-sheet-to-another-workbook-in-python/44596301#44596301 [openpyxl (Can't keep formatting), pywin32, xlwings]
# => https://stackoverflow.com/questions/56687602/copy-excel-sheet-from-one-worksheet-to-another-in-python/56688138#56688138 [xlwings]
## Solve file in use problem with pywin32 solution from questions/51986517
# => https://stackoverflow.com/questions/6337595/python-win32-com-closing-excel-workbook/6338030
## Basic Python: Pythonic way to delete a files, running code if try statements were successful
# => https://stackoverflow.com/questions/10840533/most-pythonic-way-to-delete-a-file-which-may-not-exist
# => https://stackoverflow.com/questions/2792568/running-code-if-try-statements-were-successful-in-python
## Research on openpyxl copy_worksheet(); Conclusion: it can only copy and paste sheet within same workbook. =(
# => https://stackoverflow.com/questions/44593705/how-to-copy-over-an-excel-sheet-to-another-workbook-in-python/44596301
# => https://openpyxl.readthedocs.io/en/latest/tutorial.html?highlight=copy_worksheet#manipulating-a-workbook-in-memory

Categories