I am trying to iterate through a number .rtf files and for each file: read the file, perform some operations, and then write new files into a sub-directory as plain text files with the same name as the original file, but with .txt extensions. The problem I am having is with the file naming.
If a file is named foo.rtf, I want the new file in the subdirectory to be foo.txt. here is my code:
import glob
import os
import numpy as np
dir_path = '/Users/me/Desktop/test/'
file_suffix = '*.rtf'
output_dir = os.mkdir('sub_dir')
for item in glob.iglob(dir_path + file_suffix):
with open(item, "r") as infile:
reader = infile.readlines()
matrix = []
for row in reader:
row = str(row)
row = row.split()
row = [int(value) for value in row]
matrix.append(row)
np_matrix = np.array(matrix)
inv_matrix = np.transpose(np_matrix)
new_file_name = item.replace('*.rtf', '*.txt') # i think this line is the problem?
os.chdir(output_dir)
with open(new_file_name, mode="w") as outfile:
outfile.write(inv_matrix)
When I run this code, I get a Type Error:
TypeError: coercing to Unicode: need string or buffer, NoneType found
How can I fix my code to write new files into a subdirectory and change the file extensions from .rtf to .txt? Thanks for the help.
Instead of item.replace, check out some of the functions in the os.path module (http://docs.python.org/library/os.path.html). They're made for splitting up and recombining parts of filenames. For instance, os.path.splitext will split a filename into a file path and a file extension.
Let's say you have a file /tmp/foo.rtf and you want to move it to /tmp/foo.txt:
old_file = '/tmp/foo.rtf'
(file,ext) = os.path.splitext(old_file)
print 'File=%s Extension=%s' % (file,ext)
new_file = '%s%s' % (file,'.txt')
print 'New file = %s' % (new_file)
Or if you want the one line version:
old_file = '/tmp/foo.rtf'
new_file = '%s%s' % (os.path.splitext(old_file)[0],'.txt')
I've never used glob, but here's an alternative way without using a module:
You can easily strip the suffix using
name = name[:name.rfind('.')]
and then add the new suffix:
name = name + '.txt'
Why not using a function ?
def change_suffix(string, new_suffix):
i = string.rfind('.')
if i < 0:
raise ValueError, 'string does not have a suffix'
if not new_suffix[0] == '.':
new_suffix += '.'
return string[:i] + new_suffix
glob.iglob() yields pathnames, without the character '*'.
therefore your line should be:
new_file_name = item.replace('.rtf', '.txt')
consider working with clearer names (reserve 'filename' for a file name and use 'path' for a complete path to a file; use 'path_original' instead of 'item'), os.extsep ('.' in Windows) and os.path.splitext():
path_txt = os.extsep.join([os.path.splitext(path_original)[0], 'txt'])
now the best hint of all:
numpy can probably read your file directly:
data = np.genfromtxt(filename, unpack=True)
(see also here)
To better understand where your TypeError comes from, wrap your code in the following try/except block:
try:
(your code)
except:
import traceback
traceback.print_exc()
Related
Please help.
I'm searching several .txt files, in several directories for a pattern. If there is a match, I would like to print the filename and location of the match.
Here is my code:
a = ('Z:/rodney/020year/2020-04/')
b = []
for y in os.listdir(a):
b.append(a+y+'/')
for filename in b:
path = filename
for filenames in listdir(path):
with open(path+filenames) as currentfile:
text = currentfile.read()
loan = re.compile(r'2 NNN \d LOANS')
bb = loan.search(text)
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
f.write(bb)
Error message = "TypeError: write() argument must be str, not None"
If there is a match, I would like to see only the filename and location of a match. I do not need to see "None" in every file where there is no match.
You have:
bb = loan.search(text)
But if the string you are looking for is not found in text, bb will ne None and consequently f.write(bb) will raise an exception (you did not indicate which line of code was raising the exception, so this is an educated guess).
You need to modify your code to be:
bb = loan.search(text)
if bb:
with open('z:/rodney/results.txt','a') as f:
f.write(os.path.dirname(path)+' ')
f.write(filenames[:-4]+'\n')
As an aside:
You have the statement loan = re.compile(r'2 NNN \d LOANS') in a loop. There is no need for that to be in a loop since it is invariant.
You can avoid using string slicing and bunch of functions to parse file path by using pathlib, where most of needed cases are already implemented. Also you can optimize your code by moving re.compile() out of loop (create once and use). Same with writing result - you don't need to reopen file every time, just open it once before loop start.
Optimized code:
from pathlib import Path
import re
src_dir = Path(r"Z:\rodney\020year\2020-04")
res_fn = r"z:\rodney\results.txt"
with open(res_fn, "w+") as res_f:
search_re = re.compile(r"2\sN{3}\s{28}\d\sLOANS")
for directory in src_dir.iterdir():
if directory.is_dir():
for file in directory.iterdir():
if file.is_file():
with open(file) as of:
bb = search_re.search(of.read())
if bb:
print(file.parent, file.stem, file=res_f)
print(bb.group(), file=res_f)
# res_f.write(file.parent + " " + file.stem + "\n" + bb.group())
Based on your source code, I optimized it.
I use os.walk to access each .txt file and then read it line by line in those txt files and save it in an enum. Then I will check each line in that enum with regex (I referenced Olvin Roght-san). If there is a match, it will print out the exact file location and line for you.
import os
import re
extension = [".txt"]
regex = r"2\sN{3}\s{28}\d\sLOANS"
re_Search = re.compile(regex)
path = "Z:\rodney\020year\2020-04"
for subdir, dirs, files in os.walk(path):
for file in files:
file_path = os.path.join(subdir, file)
ext = os.path.splitext(file)[-1].lower()
if ext in extension:
with open(file_path, "r") as f:
try:
f_content = f.readlines()
except Exception as e:
print(e)
for l_idx, line in enumerate(f_content):
if re_Search.search(line):
print(file_path)
print("At line: {l_idx}".format(l_idx = l_idx+1))
else:
print("Nothing!!")
I'm very green when it comes to Python, so please forgive my disgusting formatting or poor optimization.
I'm trying to write a script to sort files into new folders based on their name.
In order to match their name to the correct new location, I have a csv file with two columns; the first is part of the name of the file, and the second is the correct folder it belongs in.
So far I have everything written to extract the parts of the file names I need, but now I'm stuck as to how I can match the strings I have to a value in the csv, and then extract the adjacent column.
This is what I have so far:
import os
import csv
def openCSV(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
data = list(reader)
return data
def findDemoName(fileName):
demoName = fileName[16:]
demoName = demoName[:-11]
return demoName
def moveFiles(sortingFile, sourceDirectory, destinationDirectory):
sortingCSV = openCSV(sortingFile)
srcDir = sourceDirectory
destDir = destinationDirectory
for filename in os.listdir(srcDir):
name = findDemoName(filename)
print(name)
# begin program
if __name__ == "__main__":
# set the CSV used to sort the files
fileToSortFrom = '<csv used for sorting>'
inputDirectory = '<where the files are located>'
outputDirectory = '<where I want to move the files>'
moveFiles(fileToSortFrom, inputDirectory, outputDirectory)
Right now it just prints the extracted portion of the file name and prints it so I could make sure it was doing what I wanted.
So my next steps are
1. Match the extracted portion of the file name to a matching value in the first column of the csv
2. Take the value adjacent to the match and use it to complete the destination path for the file to be moved to
I found this thread match names in csv file to filename in folder, but I don't understand where in the answer the csv is being matched to.
If I need to clear up some points let me know and I will.
Thank you in advance for reading :)
EDIT:
I've tried to stumble my way through this, and here's what I have so far:
import os, shutil
import csv
def openCSV(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
data = list(reader)
return data
"""def createReader(csvFile):
file = open(csvFile)
reader = csv.DictReader(file)
return reader"""
def extractDemoName(fileName):
originalName = fileName
demoName = fileName[16:]
demoName = demoName[:-11]
return demoName
def moveFiles(sortingFile, sourceDirectory, destinationDirectory, prefix, suffix):
reader = openCSV(sortingFile)
#reader = createReader(sortingFile)
srcDir = sourceDirectory
destDir = destinationDirectory
column1 = 'DemographicName'
column2 = 'DemographicTypeName'
folder = ''
for filename in os.listdir(srcDir):
name = extractDemoName(filename)
for row in reader:
if row(column1) == name:
folder = row(column2)
destination = destDir + folder
file = prefix + name + suffix
shutil.copy(file, destination)
print('Moved ' + file + ' to ' + destination)
#else reader.next()
print(name)
# begin program
if __name__ == "__main__":
# set the CSV used to sort the files
fileToSortFrom = '<csv file>'
inputDirectory = '<source path>'
outputDirectory = '<destination path>'
filePrefix = '<beginning text of files>'
fileSuffix = '<ending text of files>'
moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
But now I'm receiving the following error instead:
Traceback (most recent call last):
File "script.py", line 63, in <module>
moveFiles(fileToSortFrom, inputDirectory, outputDirectory, filePrefix, fileSuffix)
File "script.py", line 38, in moveFiles
if row(column1) == name:
TypeError: 'collections.OrderedDict' object is not callable
There is the problem (line 38)
if row(column1) == name:
it should be
if row[column1] == name:
I haven't checked any other logic in the script :)
This script reads the files from the directory you pass in method move_files's from_dir.
It checks if the file in the from_dir exists in the csv_file and if it does, it gets the location and moves it to that directory.
import os
import csv
import shutil
def get_file_sorter_dict(csv_file):
return dict(list(csv.reader(open(csv_file))))
def move_files(csv_file, from_dir, to_dir):
file_sorter_dict = get_file_sorter_dict(csv_file)
for filename in os.listdir(from_dir):
if file_sorter_dict.get(filename):
# you can use the location to move the file from csv_file
# move_to = file_sorter_dict.get(filename)
# shutil.move(filename, move_to)
# or you can use to_dir to move the file.
shutil.move(filename, to_dir)
if __name__ == "__main__":
move_files('files_sorter.csv', '.', '../')
The csv I am using looks like:
name, location
"foo.txt","../"
"baz.txt","../"
I'm working on a small python 3 utility to build a zip file based on a list of file extensions. I have a text file of extensions and I'm passing a folder into the script:
working_folder = sys.argv[1]
zip_name = sys.argv[2]
#Open the extension file
extensions = []
with open('CxExt.txt') as fp:
lines = fp.readlines()
for line in lines:
extensions.append(line)
#Now get the files in the directory. If they have the right exttension add them to the list.
files = os.listdir(working_folder)
files_to_zip = []
for ext in extensions:
results = glob.glob(working_folder + '**/' + ext, recursive=True)
print(str(len(results)) + " results for " + working_folder + '**/*' + ext)
#search = "*"+ext
#results = [y for x in os.walk(working_folder) for y in glob(os.path.join(x[0], search))]
#results = list(Path(".").rglob(search))
for result in results:
files_to_zip.append(result)
if len(files_to_zip) == 0:
print("No Files Found")
sys.exit()
for f in files:
print("Checking: " + f)
filename, file_extension = os.path.splitext(f)
print(file_extension)
if file_extension in extensions:
print(f)
files_to_zip.append(file)
ZipFile = zipfile.ZipFile(zip_name, "w" )
for z in files_to_zip:
ZipFile.write(os.path.basename(z), compress_type=zipfile.ZIP_DEFLATED)
I've tried using glob, os.walk, and Path.rglob and I still can't get a list of files. There's got to be something just obvious that I'm missing. I built a test directory that has some directories, py files, and a few zip files. It returns 0 for all file types. What am I overlooking?
This is my first answer, so please don't expect it to be perfect.
I notice you're using file.readlines(). According to the Python docs here, file.readlines() returns a list of lines including the newline at the end. If your text file has the extensions separated by newlines, maybe try using file.read().split("\n") instead. Besides that, your code looks okay. Tell me if this fix doesn't work.
Lets say I have n files in a directory with filenames: file_1.txt, file_2.txt, file_3.txt .....file_n.txt. I would like to import them into Python individually and then do some computation on them, and then store the results into n corresponding output files: file_1_o.txt, file_2_o.txt, ....file_n_o.txt.
I've figured out how to import multiple files:
import glob
import numpy as np
path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
# do something to file
...
...
np.savetxt(file, ) ???
Not quite sure how to append the _o.txt (or any string for that matter) after the filename so that the output file is file_1_o.txt
Can you use the following snippet to build the output filename?
parts = in_filename.split(".")
out_filename = parts[0] + "_o." + parts[1]
where I assumed in_filename is of the form "file_1.txt".
Of course would probably be better to put "_o." (the suffix before the extension) in a variable so that you can change at will just in one place and have the possibility to change that suffix more easily.
In your case it means
import glob
import numpy as np
path = r'home\...\CurrentDirectory'
allFiles = glob.glob(path + '/*.txt')
for file in allFiles:
# do something to file
...
parts = file.split(".")
out_filename = parts[0] + "_o." + parts[1]
np.savetxt(out_filename, ) ???
but you need to be careful, since maybe before you pass out_filename to np.savetxt you need to build the full path so you might need to have something like
np.savetxt(os.path.join(path, out_filename), )
or something along those lines.
If you would like to combine the change in basically one line and define your "suffix in a variable" as I mentioned before you could have something like
hh = "_o." # variable suffix
..........
# inside your loop now
for file in allFiles:
out_filename = hh.join(file.split("."))
which uses another way of doing the same thing by using join on the splitted list, as mentioned by #NathanAck in his answer.
import os
#put the path to the files here
filePath = "C:/stack/codes/"
theFiles = os.listdir(filePath)
for file in theFiles:
#add path name before the file
file = filePath + str(file)
fileToRead = open(file, 'r')
fileData = fileToRead.read()
#DO WORK ON SPECIFIC FILE HERE
#access the file through the fileData variable
fileData = fileData + "\nAdd text or do some other operations"
#change the file name to add _o
fileVar = file.split(".")
newFileName = "_o.".join(fileVar)
#write the file with _o added from the modified data in fileVar
fileToWrite = open(newFileName, 'w')
fileToWrite.write(fileData)
#close open files
fileToWrite.close()
fileToRead.close()
I am trying to assign the elements of a list as names for some files that live in a directory, so far I created a function that recover the name of a each file from a directory and returns them in a list:
def retrive(directory_path):
path_names = []
for filename in sorted(glob.glob(os.path.join(directory_path, '*.pdf'))):
retrieved_files = filename.split('/')[-1]
path_names.append(retrieved_files)
print (path_names)
The above function returns in a list the names of each file, then I am writing the files into another directory as follows:
path = os.path.join(new_dir_path, "list%d.txt" % i)
#This is the path of each new file:
#print(path)
with codecs.open(path, "w", encoding='utf8') as filename:
for item in [a_list]:
filename.write(item+"\n")
Finally, my question is: how can I assign as a name of each file, each element of path_names?, something like this line:
path = os.path.join(new_dir_path, "list%d.txt" % i)
I also tried to use the format() function. However I still cant assign the the correct name to each file.
Here's the full script:
def transform_directoy(input_directory, output_directory):
import codecs, glob, os
from tika import parser
all_texts = []
for filename in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
parsed = parser.from_file(filename)
texts = parsed['content']
all_texts.append(texts)
for i , a_list in enumerate(all_texts):
new_dir_path = output_directory
#print(new_dir_path)
path = os.path.join(new_dir_path, "list%d.txt" % i)
with codecs.open(path, "w", encoding='utf8') as filename:
for item in [a_list]:
filename.write(item+"\n")
The desired output will consist of the actual names of each processed file.
You’re almost there:
for path_name in path_names:
path = os.path.join(new_dir_path, "list%s.txt" % path_name)
#This is the path of each new file:
#print(path)
with codecs.open(path, "w", encoding='utf8') as f:
for item in [a_list]:
f.write(item+"\n")
Update based on updated code sample. You are using different loops here, and that is not ideal unless you are doing processing in between the two loops. Since I am going to keep that structure, we are going to have to make sure to associate each block of content with the original filename. The best structure for that is a dict, and in case order is important, we use an OrderedDict. Now, when we’re looping over the filename, content pairs in the OrderedDict we’ll want to change the extension of the file to match the new file type. Luckily, python has some nice utilities for file/path manipulation in the os.path module. os.path.basename can be used to strip off the directory from a file and os.path.splitext will strip off an extension from a filename. We use both of those to get just the filename without the extension and then append .txt to designate the new file type. Putting it all together, we get :
def transform_directoy(input_directory, output_directory):
import codecs, glob, os
from collections import OrderedDict
from tika import parser
all_texts = OrderedDict()
for filename in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
parsed = parser.from_file(filename)
filename = os.path.basename(filename)
texts = parsed['content']
all_texts[filename] = texts
for i, (original_filename, a_list) in enumerate(all_texts.items()):
new_filename, _ = os.path.splitext(original_filename)
new_filename += '.txt'
new_dir_path = output_directory
#print(new_dir_path)
path = os.path.join(new_dir_path, new_filename)
# Print out the name of the file we are processing
print('Transforming %s => %s' % (original_filename, path,))
with codecs.open(path, "w", encoding='utf8') as filename:
for item in [a_list]:
filename.write(item+"\n")
Second update: OP asked how I would write this code if this was all that there was, so here goes:
# move imports to top of file: PEP 8
import codecs, glob, os
from tika import parser
def transform_directoy(input_directory, output_directory):
for filename in sorted(glob.glob(os.path.join(input_directory, '*.pdf'))):
parsed = parser.from_file(filename)
parsed_content = parsed['content']
original_filename = os.path.basename(filename)
new_filename, _ = os.path.splitext(original_filename)
new_filename += '.txt'
path = os.path.join(output_directory, new_filename)
# Print out the name of the file we are processing
print('Transforming %s => %s' % (original_filename, path,))
# no need for a second loop since we can piggy back off the first loop
with codecs.open(path, "w", encoding='utf8') as filename:
# No need for a for loop here since our list only has one item
filename.write(parsed_content)
filename.write("\n")