How to combine the texts in subfolders with Python - python

I have a folder containing 4 subfolders and want to combine the texts in each of the subfolders. In other words, the output should be 4 combined texts, one per subfolder, not a single text merging all the subfolders.
The folder is like this
I tried to use os.walk, but it produces no result.
The code is below:
# Asker's original attempt, kept as posted. Indentation was lost when the snippet
# was extracted, so the nesting of the loops below cannot be read off this listing.
import os
rootdir=r'xxx\xxx\xxx'
# One list shared by every directory visited; nothing below ever clears it.
allfiles = []
for root, dirs, files in os.walk(rootdir):
for d in dirs:
# NOTE(review): rootdir+d joins the two parts without a path separator — confirm intended.
f_out = open(rootdir+d + 'combined.txt', 'w', encoding='utf-8')
for file in files:
allfiles.append(os.path.join(root, file))
# Copies every collected .txt path, line by line, into whichever f_out is currently bound.
for i in allfiles:
if i.endswith(r'.txt'):
f_in=open(i, 'r', encoding='utf-8')
for line in f_in.readlines():
f_out.write(line)
f_in.close()
f_out.close()

When you walk (through a storm)...
You'll need to record each file against the directory it was found in. As it stands, by the time you reach your f_out.write() line, you're way out of sync with what was happening in your reading loop.
Try this:
# Map every sub-directory found under rootdir to the files that live inside it
# (directly or in deeper folders), then write one combined text per directory.
allFiles2 = {}
for root, dirs, files in os.walk(rootdir):
    for d in dirs:
        # os.walk is top-down, so a directory is registered before its files are visited.
        allFiles2[os.path.join(root, d)] = []
    for name in files:
        fullName = os.path.join(root, name)
        # Send the file to every registered directory that contains it.
        # Comparing against key + os.sep (not the bare key) keeps "sub1" from
        # also claiming "sub10/..." or its own "sub1combined.txt" output.
        for key in allFiles2:
            if fullName.startswith(key + os.sep):
                allFiles2[key].append(fullName)

# The combined files are written next to each directory as "<dir>combined.txt".
# They must never be read back as input, otherwise a second run would duplicate text.
outputs = {key + 'combined.txt' for key in allFiles2}

for i1 in allFiles2:  # Looping through the keys (directories) in allFiles2
    with open(i1 + 'combined.txt', 'w', encoding='utf-8') as f_out:
        for i2 in allFiles2[i1]:  # Looping through the files in that directory
            if i2.endswith('.txt') and i2 not in outputs:
                # Read with the same encoding the output is written in.
                with open(i2, 'r', encoding='utf-8') as f_in:
                    for line in f_in:
                        f_out.write(line)
                # Separate consecutive source files with a newline.
                f_out.write("\n")
Now, I'm not sure what shape you want the output text files to take, but hopefully that's a useful starting point for you.

Related

turning multiple nested files into a single file (concatenating them)

I have a folder consisting of 7 files, each having several text files inside. I intend to read through them and write each of those nested text files into a single file called ZebraAllRaw.txt. In the end, there must be only one single file containing all the text files that existed in each of those 7 files.
This is the function I have written:
# Asker's version, kept as posted (indentation lost in extraction).
# Walks `folder` bottom-up and writes each file's text, with newlines removed,
# as one line of D:/ZebraAllRaw.txt.
def CombineFiles(folder):
# Mode 'a' appends: whatever an earlier call wrote is still in the file.
with open('D:/ZebraAllRaw.txt', 'a', encoding="utf-8") as OutFile:
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
# Collapse the file onto a single line before writing it out.
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")
However, it seems that all the content is written into the new file 9 times, as if it had read through them more than expected.
Make sure you don't append to the output file across different runs.
I only replaced the file mode "append" with "write" in the open() call:
def CombineFiles(folder, output_path='D:/ZebraAllRaw.txt'):
    """Concatenate every file under *folder* into a single text file.

    Each source file becomes exactly one line of the output: its newlines are
    removed and a single newline is appended.

    Args:
        folder: Directory that is walked recursively (bottom-up).
        output_path: File to (re)create. Defaults to the original hard-coded
            location, so existing calls keep working.
    """
    output_abs = os.path.abspath(output_path)
    with open(output_path, 'w', encoding="utf-8") as OutFile:  # mode "w", not "a"
        for root, dirs, files in os.walk(folder, topdown=False):
            for filename in files:
                file_path = os.path.join(root, filename)
                # If the output lives inside the walked folder, never feed it
                # back into itself.
                if os.path.abspath(file_path) == output_abs:
                    continue
                with open(file_path, 'r', encoding="utf-8") as f:
                    content = f.read()
                OutFile.write(content.replace('\n', '') + "\n")

How to loop through each file in a folder, do some action to the file and save output to a file in another folder Python

I have a folder with multiple files like so:
1980
1981
1982
In each of these files is some text. I want to loop through each of these files and do some operation to each file then save the edited file to another folder and move onto the next file and so on. The result would be that I have the original folder and then another folder with the edited version of each file in it like so:
1980_filtered
1981_filtered
1982_filtered
Is it possible to do this?
Currently I have some code that loops through the files in a folder, does some filtering to each file and then saves all the edits of each file into one massive file. Here is my code:
# Asker's current script, kept as posted (indentation lost in extraction).
# Removes stop words from every file found under input_location and appends
# all results to the single file named by output_location.
import os
input_location = 'C:/Users/User/Desktop/mini_mouse'
output_location = 'C:/Users/User/Desktop/filter_mini_mouse/mouse'
for root, dir, files in os.walk(input_location):
for file in files:
# Changing directory lets the bare names below resolve against input_location.
os.chdir(input_location)
with open(file, 'r') as f, open('NLTK-stop-word-list', 'r') as f2:
mouse_file = f.read().split() # reads file and splits it into a list
stopwords = f2.read().split()
# Keep only the words whose lowercase form is not a (lowercased) stop word.
x = (' '.join(i for i in mouse_file if i.lower() not in (x.lower() for x in stopwords)))
# Mode 'a' on one fixed path: every input ends up in the same output file.
with open(output_location, 'a') as output_file:
output_file.write(x)
Any help would be greatly appreciated!
You need to specify what each new file is called. To do so, Python has some good string formatting methods. Fortunately, your new desired file names are easy to do in a loop
import os

input_location = 'C:/Users/User/Desktop/mini_mouse'
output_location = 'C:/Users/User/Desktop/filter_mini_mouse/mouse'
# The stop-word list sits in the input folder, next to the texts to filter.
stopword_path = os.path.join(input_location, 'NLTK-stop-word-list')

for root, dirs, files in os.walk(input_location):
    for file in files:
        source_path = os.path.join(root, file)
        # The stop-word list is a resource, not a text to be filtered.
        if source_path == stopword_path:
            continue
        new_file = "{}_filtered.txt".format(file)
        # Open by full path instead of os.chdir(): files in nested folders are
        # found and the process's working directory is left alone.
        with open(source_path, 'r') as f, open(stopword_path, 'r') as f2:
            mouse_file = f.read().split()
            # A set of lowercased stop words gives constant-time membership tests.
            stopwords = {word.lower() for word in f2.read().split()}
        x = ' '.join(i for i in mouse_file if i.lower() not in stopwords)
        # Make sure the destination folder exists before writing into it.
        os.makedirs(output_location, exist_ok=True)
        with open(output_location + '/' + new_file, 'w') as output_file:  # Changed 'append' to 'write'
            output_file.write(x)
If you're on Python 3.6 or newer, you can use f-strings instead:
new_file = f"{file}_filtered.txt"
and
with open(f"{output_location}/{new_file}", 'w') as output_file:
output_file.write(x)
First of all you should start by opening the NLTK-stop-word-list only once, so I moved it outside of your loops. Second, os.chdir() is redundant, you can use os.path.join() to get your current file path (and to construct your new file path):
import os

input_location = 'C:/Users/User/Desktop/mini_mouse'
output_location = 'C:/Users/User/Desktop/filter_mini_mouse/'
stop_words_path = 'C:/Users/User/Desktop/NLTK-stop-word-list.txt'

# Read the stop-word list exactly once, before the loop. A file object is
# exhausted after its first read(), so reading it again per input file would
# return '' and leave every file after the first one unfiltered.
with open(stop_words_path, 'r') as stop_words:
    stopwords = {word.lower() for word in stop_words.read().split()}

for root, dirs, files in os.walk(input_location):
    for name in files:
        file_path = os.path.join(root, name)
        with open(file_path, 'r') as f:
            mouse_file = f.read().split()  # reads file and splits it into a list
        x = ' '.join(word for word in mouse_file if word.lower() not in stopwords)
        new_file_path = os.path.join(output_location, name) + '_filtered'
        # 'w' so that re-running the script replaces, rather than doubles, the output.
        with open(new_file_path, 'w') as output_file:
            output_file.write(x)
P.S.: I took the liberty of renaming some of your variables because they shadow Python built-in names ('dir' in every version, and 'file' in Python 2). If you run dir(__builtins__) you'll see the built-in names listed there.

insert a word in lines and save the files using Python

I Have files in a directory that have many lines like this
cd /oasis/projects/nsf/ets100/
oconv /oasis/projects/nsf/ets100/rla
I would like to insert the word "sky/" so the second line reads like this
oconv /oasis/projects/nsf/sky/ets100/rla
What is the best way to do this?
I know that the beginning of the code will be something like this:
# Asker's starting fragment, incomplete as posted (indentation lost in extraction).
path = r"c:\test"
for root, subFolders, files in os.walk(path):
for file in files:
with open(os.path.join(root, file), 'r') as fRead:
# NOTE(review): `line` is not defined in this fragment — presumably a loop over fRead is missing; confirm.
if line.startswith("oconv"):
You can do something like this
import os


def replace_line(file):
    """Insert "sky/" after "/nsf/" on every line of *file* that starts with "oconv".

    The file is read fully, edited in memory and then rewritten in place.

    Args:
        file: Path of the file to edit. Relative paths resolve against the
            current working directory, so callers walking a tree must pass
            the path joined with the directory it was found in.
    """
    with open(file, 'r') as file_obj:
        data = file_obj.readlines()
    for index, lines in enumerate(data):
        if lines.startswith('oconv'):
            data[index] = lines.replace('/nsf/', '/nsf/sky/')
    with open(file, 'w') as file_obj:
        file_obj.writelines(data)


path = r"c:\test"
for root, subFolders, files in os.walk(path):
    for file in files:
        # os.walk yields bare file names; join with root so files in
        # sub-folders (or outside the cwd) are actually found.
        replace_line(os.path.join(root, file))
You can use regex to update the string:
import os
import re

path = r"c:\test"
# Compiled once for all files; "/" needs no escaping in a Python regex.
pattern = re.compile(r'oconv /oasis/projects/nsf/(.*)')
replacement = r'oconv /oasis/projects/nsf/sky/\1'

# Walk the configured path (previously `path` was defined but a literal 'test' was walked).
for root, subFolders, files in os.walk(path):
    for file in files:
        with open(os.path.join(root, file), 'r+') as fh:  # open the file with 'r+' to read and write
            content = fh.read()
            new_content = pattern.sub(replacement, content)  # replace strings in file
            # Rewind and clear the file before writing the updated text back.
            fh.seek(0)
            fh.truncate()
            fh.write(new_content)

How to search through both zipped and unzipped folders for a specific line

I'm trying to implement a Python script that takes a folder from the user (can be zipped or unzipped), and search through all the files in the folder to output the specific lines that my regular expression matches. My code below works for regular unzipped folders, but I can't figure out how to do the same with zipped folders that are inputted to function. Below are my code, thanks in advance!
# Asker's version, kept as posted (indentation lost in extraction).
# Walks folder_name, opens each file whose name has one of the listed endings,
# and prints every line that matches one of two "version" regular expressions.
def myFunction(folder_name):
path = folder_name
for (path, subdirs, files) in os.walk(path):
# The names yielded by os.walk are replaced by a filtered os.listdir() of the same directory.
files = [f for f in os.listdir(path) if f.endswith('.txt') or f.endswith('.log') or f.endswith('-release') or f.endswith('.out') or f.endswith('messages') or f.endswith('.zip')] # Specify here the format of files you hope to search from (ex: ".txt" or ".log")
files.sort() # file is sorted list
files = [os.path.join(path, name) for name in files] # Joins the path and the name, so the files can be opened and scanned by the open() function
# The following for loop searches all files with the selected format
for filename in files:
#print('start parsing... ' + str(datetime.datetime.now()))
matched_line = []
# First try to read as UTF-8; on any failure, retry with the default encoding.
try:
with open(filename, 'r', encoding = 'utf-8') as f:
f = f.readlines()
except:
with open(filename, 'r') as f:
f = f.readlines()
# print('Finished parsing... ' + str(datetime.datetime.now()))
for line in f:
# strip out \x00 from read content, in case it's encoded differently
line = line.replace('\x00', '')
RE2 = r'^Version: \d.+\d.+\d.\w\d.+'
RE3 = r'^.+version.(\d+.\d+.\d+.\d+)'
# Either pattern may match; matching ignores case.
pattern2 = re.compile('('+RE2+'|'+RE3+')', re.IGNORECASE)
for match2 in pattern2.finditer(line):
matched_line.append(line)
print(line)
#Calling the function to use it
myFunction(r"SampleZippedFolder.zip")
The try and except block of my code was my attempt to open the zipped folder and read it. I'm still not very clear with how to open the zipped folder or how it works. Please let me know how I can modify my code to make it work, much appreciated!
One possibility is to first determine what kind of object folder_name is, using zipfile.is_zipfile() and os.path.isdir(); whichever one succeeds, get the list of files and proceed. Maybe something like this:
import os
import re
import zipfile


def myFunction(folder_name):
    """List the entries of *folder_name*, which may be a zip archive or a folder.

    Args:
        folder_name: Path to a .zip archive or to a directory.

    Raises:
        ValueError: If the path is neither a zip archive nor a directory
            (previously this fell through and failed while iterating None).
    """
    path = folder_name
    if zipfile.is_zipfile(path):
        print('ZipFile: {}'.format(path))
        # The context manager closes the archive once the names are read.
        with zipfile.ZipFile(path) as f:
            files = f.namelist()
    elif os.path.isdir(path):
        print('Folder: {}'.format(path))
        # os.listdir returns bare names; join them with `path` before opening.
        files = os.listdir(path)
    else:
        raise ValueError('{!r} is neither a zip archive nor a folder'.format(path))
    # should now have a list of files
    # proceed processing the files
    for filename in files:
        ...

Code always writes filename in the output

I am trying to find which files do not yet have a corresponding file with an (almost) identical filename, so that I can generate them. But this code writes basically all file names, whereas I want it to go through the first directory, go through the files and check whether each has its equivalent _details.txt in the other folder, and if not, write the name.
I have in folder 1 those two 11.avi and 22.avi and in folder two only 11_details.txt , so am sure i should get one filename as a result
# Asker's version, kept as posted (indentation lost in extraction).
# For each matching file under "1/", walks "2/" and records "<stem>_details.txt"
# when that name is not among the files of the directory being visited.
import os,fnmatch
a = open("missing_detailss5.txt", "w")
for root, dirs, files in os.walk("1/"):
for file1 in files:
if file1.endswith(".dat"):
# The inner walk rebinds root, dirs and files, the same names used by the outer walk.
for root, dirs, files in os.walk("2/"):
print(str(os.path.splitext(file1)[0]) + "_details.txt")
print(files)
if not (os.path.splitext(file1)[0] + "_details.txt") in files:
print(str(os.path.splitext(file1)[0]) + "_details.txt is missing")
a.write(str(os.path.splitext(file1)[0]) + "_details.txt" + os.linesep)
a.close()
here is my debug >>>
11_details.txt
['22_details.txt']
11_details.txt is missing
22_details.txt
['22_details.txt']
22_details.txt is missing
I corrected your code directly instead of writing new code; you just missed a txt extension in the `if` comparison.
import os

# Walk folder "2/" a single time and remember every file name found in it,
# instead of re-walking it for each file of folder "1/".
details_present = set()
for root, dirs, files in os.walk("2/"):
    details_present.update(files)

# `with` closes the report even if something below raises.
with open("missing_detailss4.txt", "w") as a:
    for root, dirs, files in os.walk("1/"):
        for file in files:
            if file.endswith(".avi"):
                expected = os.path.splitext(file)[0] + "_details.txt"
                # Written at most once per source file, however many
                # sub-directories "2/" has.
                if expected not in details_present:
                    # "\n", not os.linesep: text mode already translates newlines.
                    a.write(expected + "\n")
If I read your question correctly, the files ending in "_details.txt" are supposed to be in the same (relative) directory. That is, "1/some/path/file.avi" should have a corresponding file "2/some/path/file_details.txt". If that's the case, you need not iterate twice:
import os

with open("missing_detailss5.txt", "w") as outfile:
    path1 = '1/'
    path2 = '2/'
    allowed_extensions = ['.dat', '.avi']
    for root, dirs, files in os.walk(path1):
        for file in files:
            file1, ext = os.path.splitext(file)
            if ext not in allowed_extensions: continue
            # Mirror the file's location relative to path1 under path2. The result
            # goes into its own variable: overwriting path2 here would make the
            # base folder grow with every file processed.
            expected = os.path.join(path2, os.path.relpath(os.path.join(root, file1 + '_details.txt'), path1))
            if not os.path.exists(expected):
                print(os.path.basename(expected) + ' is missing.')
                # "\n", not os.linesep: text mode already translates newlines.
                outfile.write(os.path.basename(expected) + '\n')
If you don't care about which extensions to check for in the first folder, then delete allowed_extensions = ['.dat', '.avi'] and if ext not in allowed_extensions: continue lines, and change file1, ext = os.path.splitext(file) to file1 = os.path.splitext(file)[0].

Categories