Editing several Excel files while iterating through a folder - python

I'm working on editing several Excel files at once and when it comes time to iterate through all of my folders, it is only capable of doing so for the first .xlsx file.
def sumOfCosts():
path=os.chdir(r'C:\Users\salvarai\AppData\Roaming\Python\Python310\site-packages\COSTBOMS')
for file in os.listdir(path):
if file.endswith(".xlsx"):
wb=load_workbook(filename=file)
sheet=wb.active()
sheet['08'].value="Total Cost="
char=get_column_letter(8)
sumchar=get_column_letter(16)
sheet[sumchar+"8"]=F"=SUM({char+'2'}:{char +'1000'})"
wb.save(file)
wb.close()
return

Your return statement is at the same indent level as 'if file.endswith(".xlsx"):', so whether or not the body of the if is executed, the function returns at the end of the first loop iteration, i.e. after the first 'file'.
To keep iterating through the files, move the 'return' to be level with the line 'for file in os.listdir(path)' so the function does not return until all the files are processed.
def sumOfCosts():
    """Add a "Total Cost =" label and a SUM formula to every workbook in COSTBOMS.

    Changes the working directory to the COSTBOMS folder, then for each
    ``.xlsx`` file found there writes the label into O8 and a formula that
    sums H2:H1000 into P8 of the active sheet, and saves the file in place.
    """
    # os.chdir() returns None, so its result is not a usable path. After the
    # chdir, listdir() with no argument lists the new working directory and
    # the bare file names resolve against it.
    os.chdir(r'C:\Users\salvarai\AppData\Roaming\Python\Python310\site-packages\COSTBOMS')
    for file in os.listdir():
        if file.endswith(".xlsx"):
            wb = load_workbook(filename=file)
            # `active` is a property; calling it raises TypeError.
            sheet = wb.active
            # Letter O (column 15), not digit 0: '08' is not a single-cell
            # reference, so it has no `.value`.
            sheet['O8'].value = "Total Cost ="
            char = get_column_letter(8)
            sumchar = get_column_letter(16)
            sheet[sumchar + "8"] = F"=SUM({char+'2'}:{char +'1000'})"
            wb.save(file)
            wb.close()
    return  # <--- The return should be at this indent level

Related

Searching excel files for string in file through multiple folder directories not working

I have this code where I am trying to search a Directory and Sub Directories for a specified string within .xls and .xlsx files and return the file name for now. When I run this - I get a return of each file directory path as text for the files ending in .xls and .xlsx and the search string parameter I use under those same returned results. The code is not isolating the files with the string - rather, just returning the file path as text for all results and adding my string parameter to search for under that. What could be happening here? and is it possible to pass a list here and copy the discovered files to a folder? That is where I am trying to get with this in the end. Thank you.
import os
import openpyxl
def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
    """Return the files under *dir* whose searchable text contains any of *strings*.

    Args:
        strings: Substrings to look for. Matching is case-insensitive.
        dir: Directory to search (name kept for existing keyword callers,
            although it shadows the builtin).
        subDirs: If true, walk *dir* recursively; otherwise only look at its
            direct children.
        fileContent: If true, the text returned by ``getFileContent`` is
            appended to the searchable text of each file.
        fileExtensions: If true, the searchable text starts from the file
            extension; otherwise from the file's base name without extension.

    Returns:
        A list of matching paths with backslashes replaced by forward
        slashes. Each file appears at most once. Nothing is printed; the
        caller decides what to do with the result.
    """
    filesInDir = []
    if not subDirs:
        for filename in os.listdir(dir):
            path = os.path.join(dir, filename).replace("\\", "/")
            if os.path.isfile(path):
                filesInDir.append(path)
    else:
        for root, _subdirs, files in os.walk(dir):
            for f in files:
                path = os.path.join(root, f).replace("\\", "/")
                if not os.path.isdir(path):
                    filesInDir.append(path)

    # The searchable text is lower-cased below, so the needles must be
    # lower-cased too or a search string with capitals could never match.
    needles = [string.lower() for string in strings]

    foundFiles = []
    for file in filesInDir:
        filename, extension = os.path.splitext(file)
        if fileExtensions:
            fileText = extension.lower()
        else:
            fileText = os.path.basename(filename).lower()
        if fileContent:
            fileText += getFileContent(file).lower()
        if any(needle in fileText for needle in needles):
            foundFiles.append(file)
    return foundFiles
def getFileContent(filename):
    """Return the cell values of a workbook as one newline-joined string.

    Args:
        filename: Path of the file to read.

    Returns:
        The text of every non-empty cell of every worksheet, one value per
        line. Returns "" when the extension is not in ``supportedTypes``,
        when the file is a legacy ``.xls`` workbook, or when the file cannot
        be opened as a workbook. The search is best-effort, so one unreadable
        file must not abort a whole directory scan.
    """
    import zipfile

    # splitext keeps the leading dot, matching the entries in supportedTypes;
    # partition(".")[2] dropped it (and split on the first dot of the path).
    extension = os.path.splitext(filename)[1].lower()
    if extension not in supportedTypes:
        return ""
    if extension == ".xls":
        # openpyxl reads only the Office Open XML formats (.xlsx/.xlsm/...);
        # the binary .xls format needs a different reader.
        return ""

    try:
        workbook = openpyxl.load_workbook(filename, read_only=True, data_only=True)
    except (zipfile.BadZipFile, OSError):
        # e.g. Excel lock files ("~$name.xlsx") or files we may not read.
        return ""

    values = []
    try:
        for sheet in workbook.worksheets:
            for row in sheet.iter_rows(values_only=True):
                for value in row:
                    if value is not None:
                        # Numbers and dates are stringified so that a search
                        # for "55413354" also finds a numeric cell.
                        values.append(str(value))
    finally:
        # A workbook is not a context manager; read-only mode keeps the file
        # handle open until close() is called.
        workbook.close()
    return "\n".join(values)
supportedTypes = [".xls", ".xlsx"]
print(findFiles(strings=["55413354"], dir="C:/Users/User/", subDirs=True, fileContent=True, fileExtensions=False))
Expected output sample - reflects a find for string '55413354' - as in, that string was located only in the file name below, out of 3 files.
Excel File Name 123
Actual output - Returns everything - no filter is happening, and includes my search string under the file name.
path/Excel File Name 123
55413354
path/Excel File Name 321
55413354
path/Excel File Name 111
55413354

Rename XLS Files Using a Cell Value - Removing White Spaces and Special Characters

Situation:
I'm attempting to rename XLS files in a directory using a specific cell value from each file (i.e. Cell A4 contains "Name1", use A4 to create Name1.xls). There's a script I found that will work for my purposes.
Problem I'm trying to solve:
Every cell I'm attempting to use as the filename has spaces and special characters. Ideally, I'd like to remove all the special characters and white spaces, and use that as the value to name each file. I'm not very familiar with regex so I'm not sure if I should be modifying the fileNameCheck = re.compile('[^\w,\s-]') part of the code, or modify first if not block...
See below code:
# Import required modules
import openpyxl
import os
import re
import shutil
# File path
filePath = 'C:\\Users\name\Documents\Python\folder'
# Cell containing new file name
cellForFileName = 'A3'
# Check to see if the file path exists
if os.path.exists(filePath):
# Change the current working directory
os.chdir(filePath)
# Check if there are any files in the chosen directory
if len(os.listdir(filePath)) == 0:
print('There are no files to rename')
else:
# Renamed file count
filesRenamed = 0
# Process the files at the path
for filename in os.listdir(filePath):
# Check if the file is an Excel file, excluding temp files
if filename.endswith('.xls.xlsx') and not filename.startswith('~'):
try:
# Open the file and find the first sheet
workbook = openpyxl.load_workbook(filename)
worksheet = workbook.worksheets[0]
# Check if there is a value in the cell for the new file name
if worksheet[cellForFileName].value is not None:
# Check to see if the cell value is valid for a file name
fileNameCheck = re.compile('[^\w,\s-]')
if not fileNameCheck.search(worksheet[cellForFileName].value):
# Construct the new file name
newFileName = worksheet[cellForFileName].value + '.xlsx'
# Close the workbook
workbook.close()
# Rename the file
shutil.move(filename, newFileName)
# Output confirmation message
print('The file "' + filename + '" has been renamed to "'
+ newFileName + '".')
# Increment the count
filesRenamed += 1
else:
# Display a message saying the file could not be renamed
print('The file "' + filename + '" could not be renamed.')
# Close the workbook
workbook.close()
else:
# Display a message saying the file could not be renamed
print('The file "' + filename + '" could not be renamed.')
# Close the workbook
workbook.close()
except PermissionError as e:
# Display a message saying the file could not be renamed
print('The file "' + filename + '" could not be renamed.')
# Display a message regarding the number of files renamed
if filesRenamed == 1:
print(str(filesRenamed) + ' file has been renamed.')
else:
print(str(filesRenamed) + ' files have been renamed.')
else:
# Display a message stating that the file path does not exist
print('File path does not exist.')
Thanks in advance for any help, advice, tips you can provide!
I think filename.endswith('.xls.xlsx') will not work the way you expect. Following the documentation of str.endswith, you may pass a tuple (.endswith(('.xls', '.xlsx'))) to match both .xls and .xlsx. Furthermore, if you are working with both types of files, it is better to keep the original extension and reuse that suffix in the rename operation, since the two formats are interpreted in distinct ways.
... information [...] stored is vastly different for both XLS and XLSX formats. XLS is based on BIFF (Binary Interchange File Format) and as such, the information is directly stored to a binary format. On the other hand, XLSX is based on the Office Open XML format, a file format that was derived from XML... [1]
You may use _, extension = os.path.splitext(filename) to get only the extension part to use later on the rename operation.
To remove special characters and spaces, you may use re.sub("[^a-zA-Z0-9]", "", nameCell). If the string after the : is allowed to contain only special characters and spaces, make sure to test for an empty string before writing the file name.
...
...
# Process the files at the path
for filename in os.listdir(filePath):
    # get extension to use later on file rename
    _, extension = os.path.splitext(filename)
    if filename.endswith(('.xls', '.xlsx')) and not filename.startswith('~'):
        try:
            workbook = openpyxl.load_workbook(filename)
            worksheet = workbook.worksheets[0]
            cellValue = worksheet[cellForFileName].value
            workbook.close()
            # get the text after the ":"
            # re.search returns None when there is no ":" and raises on a
            # non-string (empty or numeric cell), so guard both before
            # calling .group().
            match = re.search(":(.+)", cellValue) if isinstance(cellValue, str) else None
            # remove special characters and spaces; the result may be empty
            # when only special characters follow the ":"
            clearName = re.sub("[^a-zA-Z0-9]", "", match.group(1)) if match else ""
            if clearName:
                newFileName = clearName + extension
                shutil.move(filename, newFileName)
                print('The file "' + filename + '" has been renamed to "'
                      + newFileName + '".')
                filesRenamed += 1
            else:
                print('The file "' + filename + '" could not be renamed.')
        except PermissionError as e:
            ...
...
...

grab next .zip file in folder (iterate through zip directory)

Below is my most recent attempt; but alas, I print 'current_file' and it's always the same (first) .zip file in my directory?
Why/how can I iterate this to get to the next file in my zip directory?
my DIRECTORY_LOCATION has 4 zip files in it.
def find_file(cls):
listOfFiles = os.listdir(config.DIRECTORY_LOCATION)
total_files = 0
for entry in listOfFiles:
total_files += 1
# if fnmatch.fnmatch(entry, pattern):
current_file = entry
print (current_file)
""""Finds the excel file to process"""
archive = ZipFile(config.DIRECTORY_LOCATION + "/" + current_file)
for file in archive.filelist:
if file.filename.__contains__('Contact Frog'):
return archive.extract(file.filename, config.UNZIP_LOCATION)
return FileNotFoundError
find_file usage:
excel_data = pandas.read_excel(self.find_file())
Update:
I just tried changing return to yield at:
yield archive.extract(file.filename, config.UNZIP_LOCATION)
and now getting the below error at my find_file line.
ValueError: Invalid file path or buffer object type: <class 'generator'>
then I alter with the generator obj as suggested in comments; i.e.:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
and now getting this error:
generator = self.find_file(); excel_data = pandas.read_excel(generator())
TypeError: 'generator' object is not callable
Here is my /main.py if helpful
"""Start Point"""
from data.find_pending_records import FindPendingRecords
from vital.vital_entry import VitalEntry
import sys
import os
import config
import datetime
# from csv import DictWriter
if __name__ == "__main__":
try:
for file in os.listdir(config.DIRECTORY_LOCATION):
if 'VCCS' in file:
PENDING_RECORDS = FindPendingRecords().get_excel_data()
# Do operations on PENDING_RECORDS
# Reads excel to map data from excel to vital
MAP_DATA = FindPendingRecords().get_mapping_data()
# Configures Driver
VITAL_ENTRY = VitalEntry()
# Start chrome and navigate to vital website
VITAL_ENTRY.instantiate_chrome()
# Begin processing Records
VITAL_ENTRY.process_records(PENDING_RECORDS, MAP_DATA)
except:
print("exception occured")
raise
It is not tested.
def find_file(cls):
    """Yield the extracted path of every 'Contact Frog' member in the archives.

    Each entry of ``config.DIRECTORY_LOCATION`` is opened as a zip archive.
    Every member whose name contains 'Contact Frog' is extracted into
    ``config.UNZIP_LOCATION`` and the path returned by ``extract`` is yielded,
    so the caller receives one path per matching member across all archives.
    """
    for current_file in os.listdir(config.DIRECTORY_LOCATION):
        print(current_file)
        archive_path = os.path.join(config.DIRECTORY_LOCATION, current_file)
        # The with-block closes each archive when the generator moves on to
        # the next one (or is exhausted/closed) instead of leaking the handle.
        with ZipFile(archive_path) as archive:
            for file in archive.filelist:
                if 'Contact Frog' in file.filename:
                    yield archive.extract(file.filename, config.UNZIP_LOCATION)
This is just your function rewritten with yield instead of return.
I think it should be used in the following way:
for extracted_archive in self.find_file():
excel_data = pandas.read_excel(extracted_archive)
#do whatever you want to do with excel_data here
self.find_file() is a generator, should be used like an iterator (read this answer for more details).
Try to integrate the previous loop in your main script. Each iteration of the loop, it will read a different file in excel_data, so in the body of the loop you should also do whatever you need to do with the data.
Not sure what you mean by:
just one each time the script is executed
Even with yield, if you execute the script multiple times, you will always start from the beginning (and always get the first file). You should read all of the files in the same execution.

Merge files into xlsx and then reconstruct the dir

I have many files ('*.pl-pl'). My script has to find each of this files and merge them into one xlsx file using openpyxl.
Now, I want to rebuild those files, I want rebuild the same files as originals.
But there is a problem after writing:
(content variable contains content of one file (read from one excel cell))
with open(path,'w') as f:
f.write(content.encode('utf-8'))
So now I check whether the original files are the same as the new files. The text in those files seems to be the same, but there are small differences in size. When I use the WinDiff application to compare them, it finds some tuples which are different, but it says that they differ in blanks only.
Could you give me an advice how to rebuild those files to be the same as before?
Or is this way correct?
Note: I am trying to rebuild them to be sure that they will have the same encoding etc., because the merged Excel file will be used for translation, and then the translated files have to be rebuilt in place of the originals.
Here is the code - it checks the directory and prints all file names and contents into one temporary file. Then it creates an Excel file: the 1st column is the path (to be able to reconstruct the directory) and the 2nd column contains the content of the file, where newlines have been replaced with '*=*'.
def print_to_file():
import os
for root, dirs, files in os.walk("OriginalDir"):
for file in files:
text = []
if file.endswith(".pl-pl"):
abs_path = os.path.join(root, file)
with open(abs_path) as f:
for line in f:
text.append(line.strip('\n'))
mLib.printToFile('files.mdoc', abs_path + '::' + '*=*'.join(text)) #'*=*' represents '\n'
def write_it():
from openpyxl import Workbook
import xlsxwriter
file = 'files.mdoc'
workbook = Workbook()
worksheet = workbook.worksheets[0]
worksheet.title = "Translate"
i = 0
with open(file) as f:
classes = set()
for line in f:
i += 1
splitted = line.strip('\n').split('::')
name = splitted[0]
text = splitted[1].split('*=*')
text = [x.encode('string-escape') for x in text]
worksheet.cell('B{}'.format(i)).style.alignment.wrap_text = True
worksheet.cell('B{}'.format(i)).value = splitted[1]
worksheet.cell('A{}'.format(i)).value = splitted[0]
workbook.save('wrap_text1.xlsx')
import openpyxl
def rebuild():
wb = openpyxl.load_workbook('wrap_text1.xlsx')
ws = wb.worksheets[0]
row_count = ws.get_highest_row()
for i in xrange(1, row_count + 1):
dir_file = ws.cell('A{}'.format(i)).value
content = ws.cell('B{}'.format(i)).value
remake(dir_file, content)
import os
def remake(path, content):
content = re.sub('\*=\*', '\n', content)
result = ''
splt = path.split('\\')
file = splt[-1]
for dir in splt[:-1]:
result += dir + '/'
# print result
if not os.path.isdir(result):
# print result
os.mkdir(result)
with open(path, 'w') as f:
f.write(content.encode('utf-8'))
# print_to_file() # print to temp file - paths and contents separated by '::'
# write_it() # write it into the excel file
# rebuild() # reconstruct directory

How to use Python to find a string in a line and change the text n lines after the string

I need to find every instance of "translate" in a text file and replace a value 4 lines after finding the text:
"(many lines)
}
}
translateX xtran
{
keys
{
k 0 0.5678
}
}
(many lines)"
The value 0.5678 needs to be 0. It will always be 4 lines below the "translate" string
The file has up to about 10,000 lines.
example text file name: 01F.pz2.
I'd also like to cycle through the folder and repeat the process for every file with the pz2 extension (up to 40).
Any help would be appreciated!
Thanks.
I'm not quite sure about the logic for replacing 0.5678 in your file, therefore I use a function for that - change it to whatever you need, or explain more in details what you want. Last number in line? only floating-point number?
Try:
import os
dirname = "14432826"
lines_distance= 4
def replace_whatever(line):
    """Return *line* with the value to reset replaced.

    Put your logic for replacing here. This default swaps every occurrence
    of the literal "0.5678" for "0" and leaves other lines unchanged.
    """
    return line.replace("0.5678", "0")
for filename in filter(lambda x:x.endswith(".pz2") and not x.startswith("m_"), os.listdir(dirname)):
print filename
with open(os.path.join(dirname, filename), "r") as f_in, open(os.path.join(dirname,"m_%s" % filename), "w") as f_out:
replace_tasks = []
for line in f_in:
# search marker in line
if line.strip().startswith("translate"):
print "Found marker in", line,
replace_tasks.append(lines_distance)
# replace if necessary
if len(replace_tasks)>0 and replace_tasks[0] == 0:
del replace_tasks[0]
print "line to change is", line,
line_to_write = replace_whatever(line)
else:
line_to_write = line
# Write to output
f_out.write(line_to_write)
# decrease counters
for i, task in enumerate(replace_tasks):
replace_tasks[i] -= 1
The comments within the code should help understanding. The main concept is the list replace_tasks that keeps record of when the next line to modify will come.
Remarks: Your code sample suggests that the data in your file is structured. It will definitely be safer to parse this structure and work on it instead of using a search-and-replace approach on a plain text file.
Thorsten, I renamed my original files to have the .old extension and the following code works:
import os
target_dir = "."
# Rebuild every "<name>.old" file found under target_dir as "<name>.pz2",
# replacing the line 4 lines below each "translate..." marker with " k 0 0".
# Written to run unchanged on both Python 2.7 and Python 3.
for path, dirs, files in os.walk(target_dir):
    for file in files:
        # get the filename and extension
        filename, ext = os.path.splitext(file)
        # only the renamed originals (*.old) are processed
        if not ext.endswith('.old'):
            continue
        old_filepath = os.path.join(path, filename + ".old")
        new_filepath = os.path.join(path, filename + ".pz2")
        # `with` closes both files, so the new .pz2 is fully flushed to disk
        with open(old_filepath, "r") as oldpz2, open(new_filepath, "w") as newpz2:
            # line number of the pending replacement; 0 means "none pending"
            changeline = 0
            # cycle through old lines, counting from 1
            for currentline, line in enumerate(oldpz2, 1):
                if line.strip().startswith("translate"):
                    changeline = currentline + 4
                if currentline == changeline:
                    newpz2.write(" k 0 0\n")
                else:
                    # `line` already ends with its own newline; printing it
                    # would append a second one and double-space the output
                    newpz2.write(line)

Categories