I have many files ('*.pl-pl'). My script has to find each of these files and merge them into one xlsx file using openpyxl.
Now I want to rebuild those files so that they are the same as the originals.
But there is a problem after writing:
(the content variable contains the content of one file, read from one Excel cell)
with open(path, 'w') as f:
    f.write(content.encode('utf-8'))
Now I check whether the original files are the same as the new files. The text in those files seems to be the same, but there are small differences in size. When I use the WinDiff application to compare them, it finds some lines that differ, but it says they differ in blanks only.
Could you give me advice on how to rebuild those files so that they are the same as before?
Or is this approach correct?
Note: I am rebuilding them to be sure they keep the same encoding etc., because the merged Excel file will be used for translation and the translated files then have to be rebuilt in place of the originals.
Here is the code. It walks the directory and prints all file names and contents into one temporary file. Then it creates an Excel file: the 1st column is the path (to be able to reconstruct the directory) and the 2nd column contains the content of the file, where newlines have been replaced with '*=*'.
def print_to_file():
    import os
    for root, dirs, files in os.walk("OriginalDir"):
        for file in files:
            text = []
            if file.endswith(".pl-pl"):
                abs_path = os.path.join(root, file)
                with open(abs_path) as f:
                    for line in f:
                        text.append(line.strip('\n'))
                mLib.printToFile('files.mdoc', abs_path + '::' + '*=*'.join(text))  # '*=*' represents '\n'
def write_it():
    from openpyxl import Workbook
    import xlsxwriter
    file = 'files.mdoc'
    workbook = Workbook()
    worksheet = workbook.worksheets[0]
    worksheet.title = "Translate"
    i = 0
    with open(file) as f:
        classes = set()
        for line in f:
            i += 1
            splitted = line.strip('\n').split('::')
            name = splitted[0]
            text = splitted[1].split('*=*')
            text = [x.encode('string-escape') for x in text]
            worksheet.cell('B{}'.format(i)).style.alignment.wrap_text = True
            worksheet.cell('B{}'.format(i)).value = splitted[1]
            worksheet.cell('A{}'.format(i)).value = splitted[0]
    workbook.save('wrap_text1.xlsx')
import openpyxl

def rebuild():
    wb = openpyxl.load_workbook('wrap_text1.xlsx')
    ws = wb.worksheets[0]
    row_count = ws.get_highest_row()
    for i in xrange(1, row_count + 1):
        dir_file = ws.cell('A{}'.format(i)).value
        content = ws.cell('B{}'.format(i)).value
        remake(dir_file, content)
import os
import re  # needed for re.sub below

def remake(path, content):
    content = re.sub('\*=\*', '\n', content)
    result = ''
    splt = path.split('\\')
    file = splt[-1]
    for dir in splt[:-1]:
        result += dir + '/'
        # print result
        if not os.path.isdir(result):
            # print result
            os.mkdir(result)
    with open(path, 'w') as f:
        f.write(content.encode('utf-8'))
# print_to_file()  # print paths and contents, separated by '::', to the temp file
# write_it()       # write it into the excel file
# rebuild()        # reconstruct the directory
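The whitespace-only differences WinDiff reports usually come from newline handling: line.strip('\n') keeps a trailing '\r' on Windows-style lines, and writing with open(path, 'w') in text mode can translate line endings again. Below is a minimal sketch of a byte-preserving round trip, assuming the differences really are only line endings (the helper names are illustrative, not from the original script):

import io

def read_lines_exact(path):
    # splitlines(True) keeps the original line endings ('\n' or '\r\n')
    # attached to each line instead of stripping them.
    with io.open(path, 'rb') as f:
        return f.read().decode('utf-8').splitlines(True)

def write_lines_exact(path, lines):
    # Writing bytes avoids any platform newline translation.
    with io.open(path, 'wb') as f:
        f.write(u''.join(lines).encode('utf-8'))

Storing the lines with their original endings (or at least recording whether the file ended with a newline) is what makes the rebuilt file byte-identical to the original.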
I have this code where I am trying to search a directory and its subdirectories for a specified string within .xls and .xlsx files and, for now, return the file names. When I run this, I get back the directory path of every file ending in .xls and .xlsx, with the search string I used printed under each of those results. The code is not isolating the files that contain the string; rather, it just returns the file path for all results and adds my search string under that. What could be happening here? And is it possible to pass a list here and copy the discovered files to a folder? That is where I am trying to get with this in the end. Thank you.
import os
import openpyxl

def findFiles(strings, dir, subDirs, fileContent, fileExtensions):
    filesInDir = []
    foundFiles = []
    filesFound = 0
    if not subDirs:
        for filename in os.listdir(dir):
            if os.path.isfile(os.path.join(dir, filename).replace("\\", "/")):
                filesInDir.append(os.path.join(dir, filename).replace("\\", "/"))
    else:
        for root, subdirs, files in os.walk(dir):
            for f in files:
                if not os.path.isdir(os.path.join(root, f).replace("\\", "/")):
                    filesInDir.append(os.path.join(root, f).replace("\\", "/"))
    print(filesInDir)
    if filesInDir:
        for file in filesInDir:
            print("Current file: " + file)
            filename, extension = os.path.splitext(file)
            if fileExtensions:
                fileText = extension
            else:
                fileText = os.path.basename(filename).lower()
            if fileContent:
                fileText += getFileContent(file).lower()
            for string in strings:
                print(string)
                if string in fileText:
                    foundFiles.append(file)
                    filesFound += 1
                    break
    return foundFiles

def getFileContent(filename):
    if filename.partition(".")[2] in supportedTypes:
        if filename.endswith(".xls"):
            content = ""
            with openpyxl.load_workbook(filename) as pdf:
                for x in range(0, len(pdf.pages)):
                    page = pdf.pages[x]
                    content = content + page.extract_text()
            return content
        elif filename.endswith(".xlsx"):
            with openpyxl.load_workbook(filename, 'r') as f:
                content = ""
                lines = f.readlines()
                for x in lines:
                    content = content + x
                f.close()
            return content
    else:
        return ""

supportedTypes = [".xls", ".xlsx"]

print(findFiles(strings=["55413354"], dir="C:/Users/User/", subDirs=True, fileContent=True, fileExtensions=False))
Expected output sample (reflects a find for the string '55413354', i.e. that string was located in the file name below only, out of 3 files):
Excel File Name 123
Actual output (returns everything; no filtering is happening, and my search string appears under each file name):
path/Excel File Name 123
55413354
path/Excel File Name 321
55413354
path/Excel File Name 111
55413354
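For reference, openpyxl cannot open legacy .xls files at all, and load_workbook does not act as a context manager with readlines(); cell text is read through worksheets and iter_rows. A minimal sketch of what the .xlsx branch of getFileContent could look like (this is an assumption about the intended behaviour, not the poster's code; .xls files would need a different library such as xlrd):

import openpyxl

def get_xlsx_content(filename):
    # Collect the text of every non-empty cell on every sheet.
    wb = openpyxl.load_workbook(filename, read_only=True, data_only=True)
    parts = []
    for ws in wb.worksheets:
        for row in ws.iter_rows(values_only=True):
            parts.extend(str(value) for value in row if value is not None)
    wb.close()
    return " ".join(parts)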
I have a dataset with 5 folders, each of which contains 100 .txt files. In the code below you can see that I am looping through every file and removing certain words from those files using my StopWords.txt file.
After I remove the words, I append the output to one file (filteredtext.txt). But I want the output to have exactly the same structure as my dataset (5 folders, each with 100 .txt files).
This is my code.
import re
import os

# insert stopwords files
stopwordfile = open("StopWords.txt", encoding='utf-8')

# Use this to read file content as a stream:
readstopword = stopwordfile.read()
stop_words = readstopword.split()

# file path to dataset
for path, _, files in os.walk("sinhala-set1"):
    for file_name in files:
        filepath = os.path.join(path, file_name)
        print(f"Checking --> {filepath}")
        file1 = open(filepath, encoding='utf-8')
        # Use this to read file content as a stream:
        line = file1.read()
        words = line.split()
        for r in words:
            if not r in stop_words:
                appendFile = open('filteredtext.txt', 'a', encoding='utf-8')
                appendFile.write(" " + r)
                appendFile.close()
You are appending to the same file because you are opening the same .txt file in append mode: appendFile = open('filteredtext.txt', 'a', encoding='utf-8'). If you want a separate file for each loop, open a different file like this:
output_file = open('output_' + file_name, 'w', encoding='utf-8')
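If the goal is to reproduce the dataset layout (5 folders of 100 .txt files each), another option is to mirror the input tree under a separate output root and write one filtered file per input file. A rough sketch, reusing the stop_words list from the question (the output folder name filtered-set1 is just an example):

import os

input_root = "sinhala-set1"
output_root = "filtered-set1"   # hypothetical output folder

for path, _, files in os.walk(input_root):
    for file_name in files:
        # Recreate the same sub-folder structure under the output root.
        rel_dir = os.path.relpath(path, input_root)
        out_dir = os.path.join(output_root, rel_dir)
        os.makedirs(out_dir, exist_ok=True)
        with open(os.path.join(path, file_name), encoding='utf-8') as src:
            words = src.read().split()
        filtered = [w for w in words if w not in stop_words]
        with open(os.path.join(out_dir, file_name), 'w', encoding='utf-8') as dst:
            dst.write(" ".join(filtered))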
I'm very green when it comes to Python, so please forgive my disgusting formatting or poor optimization.
I'm trying to write a script to sort files into new folders based on their name.
In order to match their name to the correct new location, I have a csv file with two columns; the first is part of the name of the file, and the second is the correct folder it belongs in.
So far I have everything written to extract the parts of the file names I need, but now I'm stuck as to how I can match the strings I have to a value in the csv, and then extract the adjacent column.
This is what I have so far:
import os
import csv

def openCSV(csvFile):
    file = open(csvFile)
    reader = csv.DictReader(file)
    data = list(reader)
    return data

def findDemoName(fileName):
    demoName = fileName[16:]
    demoName = demoName[:-11]
    return demoName

def moveFiles(sortingFile, sourceDirectory, destinationDirectory):
    sortingCSV = openCSV(sortingFile)
    srcDir = sourceDirectory
    destDir = destinationDirectory
    for filename in os.listdir(srcDir):
        name = findDemoName(filename)
        print(name)

# begin program
if __name__ == "__main__":
    # set the CSV used to sort the files
    fileToSortFrom = '<csv used for sorting>'
    inputDirectory = '<where the files are located>'
    outputDirectory = '<where I want to move the files>'
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory)
Right now it just prints the extracted portion of the file name so I could make sure it was doing what I wanted.
So my next steps are
1. Match the extracted portion of the file name to a matching value in the first column of the csv
2. Take the value adjacent to the match and use it to complete the destination path for the file to be moved to
I found this thread match names in csv file to filename in folder, but I don't understand where in the answer the csv is being matched to.
If I need to clear up some points let me know and I will.
Thank you in advance for reading :)
EDIT:
I've tried to stumble my way through this, and here's what I have so far:
import os, shutil
import csv

def openCSV(csvFile):
    file = open(csvFile)
    reader = csv.DictReader(file)
    data = list(reader)
    return data

"""def createReader(csvFile):
    file = open(csvFile)
    reader = csv.DictReader(file)
    return reader"""

def extractDemoName(fileName):
    originalName = fileName
    demoName = fileName[16:]
    demoName = demoName[:-11]
    return demoName

def moveFiles(sortingFile, sourceDirectory, destinationDirectory, prefix, suffix):
    reader = openCSV(sortingFile)
    #reader = createReader(sortingFile)
    srcDir = sourceDirectory
    destDir = destinationDirectory
    column1 = 'DemographicName'
    column2 = 'DemographicTypeName'
    folder = ''
    for filename in os.listdir(srcDir):
        name = extractDemoName(filename)
        for row in reader:
            if row(column1) == name:
                folder = row(column2)
                destination = destDir + folder
                file = prefix + name + suffix
                shutil.copy(file, destination)
                print('Moved ' + file + ' to ' + destination)
            #else reader.next()
        print(name)

# begin program
if __name__ == "__main__":
    # set the CSV used to sort the files
    fileToSortFrom = '<csv file>'
    inputDirectory = '<source path>'
    outputDirectory = '<destination path>'
    filePrefix = '<beginning text of files>'
    fileSuffix = '<ending text of files>'
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
But now I'm receiving the following error instead:
Traceback (most recent call last):
  File "script.py", line 63, in <module>
    moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
  File "script.py", line 38, in moveFiles
    if row(column1) == name:
TypeError: 'collections.OrderedDict' object is not callable
Here is the problem (line 38):
if row(column1) == name:
it should be
if row[column1] == name:
I haven't checked any other logic in the script :)
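For reference, csv.DictReader yields one dict-like row per line, keyed by the CSV header, so columns are read with square brackets rather than a call. A tiny illustration using the column names from the question (the path is the same placeholder the question uses):

import csv

with open('<csv file>') as f:          # placeholder path from the question
    for row in csv.DictReader(f):
        # row behaves like a dict keyed by the header names
        if row['DemographicName'] == 'some demo name':
            folder = row['DemographicTypeName']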
This script reads the files from the directory you pass as move_files's from_dir argument.
It checks whether each file in from_dir exists in the csv_file, and if it does, it gets the location and moves the file to that directory.
import os
import csv
import shutil

def get_file_sorter_dict(csv_file):
    return dict(list(csv.reader(open(csv_file))))

def move_files(csv_file, from_dir, to_dir):
    file_sorter_dict = get_file_sorter_dict(csv_file)
    for filename in os.listdir(from_dir):
        if file_sorter_dict.get(filename):
            # you can use the location to move the file from csv_file
            # move_to = file_sorter_dict.get(filename)
            # shutil.move(filename, move_to)
            # or you can use to_dir to move the file.
            shutil.move(filename, to_dir)

if __name__ == "__main__":
    move_files('files_sorter.csv', '.', '../')
The csv I am using looks like:
name, location
"foo.txt","../"
"baz.txt","../"
Hi, I am trying to run a utility script I found on GitHub:
https://gist.github.com/Athmailer/4cdb424f03129248fbb7ebd03df581cd
Update 1:
I modified the logic a bit more: rather than splitting the csv into multiple csvs again, I am creating a single Excel file with multiple sheets containing the splits. Below is my code:
import os
import csv
import openpyxl
import argparse

def find_csv_filenames(path_to_dir, suffix=".csv"):
    filenames = os.listdir(path_to_dir)
    return [filename for filename in filenames if filename.endswith(suffix)]

def is_binary(filename):
    """
    Return true if the given filename appears to be binary.
    File is considered to be binary if it contains a NULL byte.
    FIXME: This approach incorrectly reports UTF-16 as binary.
    """
    with open(filename, 'rb') as f:
        for block in f:
            if '\0' in block:
                return True
    return False

def split(filehandler, delimiter=',', row_limit=5000,
          output_name_template='.xlsx', output_path='.', keep_headers=True):
    class MyDialect(csv.excel):
        def __init__(self, delimiter=','):
            self.delimiter = delimiter
        lineterminator = '\n'
    my_dialect = MyDialect(delimiter=delimiter)
    reader = csv.reader(filehandler, my_dialect)
    index = 0
    current_piece = 1
    # Create a new Excel workbook
    # Create a new Excel sheet with name Split1
    current_out_path = os.path.join(
        output_path,
        output_name_template
    )
    wb = openpyxl.Workbook()
    ws = wb.create_sheet(index=index, title="Split" + str(current_piece))
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        ws.append(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            ws = wb.create_sheet(index=index, title="Split" + str(current_piece))
            if keep_headers:
                ws.append(headers)
        ws.append(row)
    wb.save(current_out_path)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Splits a CSV file into multiple pieces.',
                                     prefix_chars='-+')
    parser.add_argument('-l', '--row_limit', type=int, default=5000,
                        help='The number of rows you want in each output file. (default: 5000)')
    args = parser.parse_args()
    # Check if output path exists, else create new output folder
    output_path = 'Output'
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    with open('Logger.log', 'a+') as logfile:
        logfile.write('Filename --- Number of Rows\n')
        logfile.write('#Unsplit\n')
        # Get list of all csv's in the current folder
        filenames = find_csv_filenames(os.getcwd())
        filenames.sort()
        rem_filenames = []
        for filename in filenames:
            if is_binary(filename):
                logfile.write('{} --- binary -- skipped\n'.format(filename))
                rem_filenames.append(filename)
            else:
                with open(filename, 'rb') as infile:
                    reader_file = csv.reader(infile, delimiter=";", lineterminator="\n")
                    value = len(list(reader_file))
                    logfile.write('{} --- {} \n'.format(filename, value))
        filenames = [item for item in filenames if item not in rem_filenames]
        filenames.sort()
        logfile.write('#Post Split\n')
        for filename in filenames:
            # try:
            with open(filename, 'rb') as infile:
                name = filename.split('.')[0]
                split(filehandler=infile, delimiter=';', row_limit=args.row_limit,
                      output_name_template=name + '.xlsx', output_path='Output')
I have a folder called 'CSV Files' which contains a lot of csvs that need to be split.
I am keeping this utility script in the same folder.
I get the following error when running the script:
Traceback (most recent call last):
  File "csv_split.py", line 96, in <module>
    split(filehandler=infile,delimiter=';',row_limit=args.row_limit,output_name_template= name + '.xlsx',output_path='Output')
  File "csv_split.py", line 57, in split
    ws.append(row)
  File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/worksheet/worksheet.py", line 790, in append
    cell = Cell(self, row=row_idx, col_idx=col_idx, value=content)
  File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 114, in __init__
    self.value = value
  File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 294, in value
    self._bind_value(value)
  File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 191, in _bind_value
    value = self.check_string(value)
  File "/home/ramakrishna/.local/lib/python2.7/site-packages/openpyxl/cell/cell.py", line 156, in check_string
    raise IllegalCharacterError
openpyxl.utils.exceptions.IllegalCharacterError
Can someone let me know whether I have to add another for loop and go through each cell in the row and append it to the sheet, or can it be done in a single go? Also, I seem to have made this logic rather clumsy; can it be optimized further?
Folder structure for your reference
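openpyxl raises IllegalCharacterError when a cell value contains ASCII control characters that are not allowed in the XLSX XML, so an extra loop over the cells is not strictly needed; it is usually enough to strip those characters from each row before ws.append(row). A sketch of that idea, as a common workaround rather than part of the original gist (the regex is an assumption about which characters are offending in the data):

import re

# ASCII control characters that cannot be stored in XLSX cell text
# (tab, newline and carriage return are allowed and therefore excluded).
ILLEGAL_XLSX_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')

def clean_row(row):
    # Strip illegal control characters from every string field of a CSV row.
    return [ILLEGAL_XLSX_CHARS.sub('', cell) if isinstance(cell, str) else cell
            for cell in row]

# inside split(), the append would then become:
#     ws.append(clean_row(row))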
You must pass just the name of the file as a command-line argument:
python splitter.py 'Sports & Outdoors 2017-08-26'
Also, I tried running the above script and, no matter which CSV I run it on, it doesn't return the first line (which should normally be a header) although keep_headers = True. Setting keep_headers = False also prints out the header line, which is a bit counterintuitive.
This script is meant to read a single CSV. If you want to read every CSV in a directory, you want to make another script that will loop through all the files in that directory.
import splitter as sp
import os

files = [f for f in os.listdir('/your/directory') if f[-4:] == '.csv']

for file in files:
    with open(file, 'r') as f:
        sp.split(f)
Let's say I have n files in a directory with filenames: file_1.txt, file_2.txt, file_3.txt, ..., file_n.txt. I would like to import them into Python individually, do some computation on them, and then store the results in n corresponding output files: file_1_o.txt, file_2_o.txt, ..., file_n_o.txt.
I've figured out how to import multiple files:
import glob
import numpy as np

path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
    # do something to file
    ...
    ...
    np.savetxt(file, ) ???
I'm not quite sure how to insert the _o (or any other string, for that matter) into the filename so that the output file is file_1_o.txt.
Can you use the following snippet to build the output filename?
parts = in_filename.split(".")
out_filename = parts[0] + "_o." + parts[1]
where I assumed in_filename is of the form "file_1.txt".
Of course, it would probably be better to put "_o." (the suffix before the extension) in a variable, so that you can change it at will in just one place and change that suffix more easily.
In your case it means
import glob
import numpy as np

path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
    # do something to file
    ...
    parts = file.split(".")
    out_filename = parts[0] + "_o." + parts[1]
    np.savetxt(out_filename, ) ???
but you need to be careful: before you pass out_filename to np.savetxt you may need to build the full path, so you might need something like
np.savetxt(os.path.join(path, out_filename), )
or something along those lines.
If you would like to make the change in basically one line and define your "suffix in a variable" as I mentioned before, you could have something like
hh = "_o."  # variable suffix
..........
# inside your loop now
for file in allFiles:
    out_filename = hh.join(file.split("."))
which is another way of doing the same thing, using join on the split list, as mentioned by #NathanAck in his answer.
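One caveat with splitting on "." is that it breaks if the path contains any other dot (for example a folder like user.name). A slightly more robust sketch of the same idea uses os.path.splitext, which splits off only the extension:

import os

def output_name(in_filename, suffix="_o"):
    # 'file_1.txt' -> 'file_1_o.txt', regardless of dots elsewhere in the path
    root, ext = os.path.splitext(in_filename)
    return root + suffix + ext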
import os

# put the path to the files here
filePath = "C:/stack/codes/"
theFiles = os.listdir(filePath)

for file in theFiles:
    # add path name before the file
    file = filePath + str(file)
    fileToRead = open(file, 'r')
    fileData = fileToRead.read()

    # DO WORK ON SPECIFIC FILE HERE
    # access the file through the fileData variable
    fileData = fileData + "\nAdd text or do some other operations"

    # change the file name to add _o
    fileVar = file.split(".")
    newFileName = "_o.".join(fileVar)

    # write the file with _o added from the modified data in fileVar
    fileToWrite = open(newFileName, 'w')
    fileToWrite.write(fileData)

    # close open files
    fileToWrite.close()
    fileToRead.close()