I have a folder consisting of 7 files, each having several text files inside. I intend to read through them and write each of those nested text files into a single file called ZebraAllRaw.txt. In the end, there must be only one single file containing all the text files that existed in each of those 7 files.
This is the function I have written:
def CombineFiles(folder):
with open('D:/ZebraAllRaw.txt', 'a', encoding="utf-8") as OutFile:
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")
However, it seems that all the content is written into the new file 9 times, as if it had read through them more than expected.
make sure you con't append the files from different runs.
I only replaced the file mode append with write at the open
def CombineFiles(folder):
with open('D:/ZebraAllRaw.txt', 'w', encoding="utf-8") as OutFile: # mode "w", not "a"
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")
Related
So I have some .txt files inside of directory. Each .txt file contains some paths like:
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module.c'
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module2.c'
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module3.c'
I need just some small function that will go through each line of each file inside of a dir and remove there ', so only clear path is left like:
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module.c
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module2.c
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module3.c
My code at the moment is:
for filename in files:
with open(filename, 'r') as file:
content = file.read().split('\n')
for line in content:
if line.startswith('')and line.endswith(''):
remove('')
Please assist!
SOLUTION:
I have managed to find a solution with a bit different approach:
for filename in files:
f = open(filename, 'rt')
filedata = f.read()
filedata = filedata.replace("'","")
f.close()
f = open(filename, 'wt')
f.write(filedata)
f.close()
Thanks!
python has a hirarchy to strings ', ", "" and so on so you can wrap a uptick into quotes for a split. Since we have the first element '' before the tick the second is your path
line.split("'")[1]
Edit: If i understood you correctly you want this
for filename in files:
paths = []
with open(filename, 'r') as file:
content = file.read().split('\n')
for line in content:
paths.append(line.split("'")[1])
file.close()
with open(filename, 'w') as file:
file.writelines(paths)
file.close()
Soo I just did bit different approach and managed to find a solution:
for filename in files:
f = open(filename, 'rt')
filedata = f.read()
filedata = filedata.replace("'","")
f.close()
f = open(filename, 'wt')
f.write(filedata)
f.close()
Thanks guys anyway!
I have a folder containing 4 subfolders and want to combine the texts in each of the subfolders (In others words, the production should be 4 combined texts respectively but not a whole text unifying all the subfolders).
The folder is like this
I want to use os.walk but have no result.
the code is below:
import os
rootdir=r'xxx\xxx\xxx'
allfiles = []
for root, dirs, files in os.walk(rootdir):
for d in dirs:
f_out = open(rootdir+d + 'combined.txt', 'w', encoding='utf-8')
for file in files:
allfiles.append(os.path.join(root, file))
for i in allfiles:
if i.endswith(r'.txt'):
f_in=open(i, 'r', encoding='utf-8')
for line in f_in.readlines():
f_out.write(line)
f_in.close()
f_out.close()
When you walk (through a storm)...
You'll need to log the files to the directory they were found in - as it is, when you get to your f_out.write() line, your way out of sync with what was happening on your reading loop.
Try this:
allFiles2 = {}
for root, dirs, files in os.walk(rootdir):
for d in dirs:
f_out = open(rootdir + d + 'combined.txt', 'w', encoding='utf-8')
allFiles2[os.path.join(root,d)] = []
for name in files:
fullName = os.path.join(root, name)
#At this point, we want to send the file to the correct dictionary position, so use the path defining the dictionary as a search item
for key in allFiles2:
if key == fullName[:len(key)]:
allFiles2[key].append(os.path.join(root, name))
for i1 in allFiles2: #Looping through the keys (directories) in allFiles2
with open(i1 + 'combined.txt', 'w', encoding='utf-8') as f_out:
for i2 in allFiles2[i1]: #Looping through the files in that directory
if i2.endswith('.txt'):
with open(i2, 'r') as f_in:
for line in f_in.readlines():
f_out.write(line)
f_out.write("\n")
Now, I'm not sure what shape you hope the output text files to be - but hopefully thats a useful starting point for you.
I have a Dataset, which has 5 folders, in which each folder has 100 .txt files. Below code you can see that I am looping through every file, and removing certain words from those files using my StopWords.txt file.
After I remove the words I am appending the output in one file(filteredtext.txt). But I want to have these output exactly as my Dataset (5 folders which has 100 .txt file).
This is my code.
import re
import os
#insert stopwords files
stopwordfile = open("StopWords.txt", encoding='utf-8')
# Use this to read file content as a stream:
readstopword = stopwordfile.read()
stop_words = readstopword.split()
#file path to dataset
for path, _, files in os.walk("sinhala-set1"):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
file1 = open(filepath, encoding='utf-8')
# Use this to read file content as a stream:
line = file1.read()
words = line.split()
for r in words:
if not r in stop_words:
appendFile = open('filteredtext.txt','a', encoding='utf-8')
appendFile.write(" "+r)
appendFile.close()
You are appending the file because you are opening the same .txt file with appending mode appendFile = open('filteredtext.txt','a', encoding='utf-8') If you want a separate file for each loop, open a different file like this:
output_file = open('output_' + file_name), 'w', encoding='utf-8')
I have a dictionnary that group different pattern :
dico_cluster={'cluster_1': ['CUX2', 'CUX1'], 'cluster_2': ['RFX3', 'RFX2'],'cluster_3': ['REST']}
Then I have files in a folder :
"/path/to/test/files/CUX1.txt"
"/path/to/test/files/CUX2.txt"
"/path/to/test/files/RFX3.txt"
"/path/to/test/files/RFX2.txt"
"/path/to/test/files/REST.txt"
"/path/to/test/files/ZEB.txt"
"/path/to/test/files/TEST.txt"
I'm trying to concatenate the files that are in the same cluster. The output file name should be the name of pattern join by underscore "_"
I tried this :
filenames = glob.glob('/path/to/test/files/*.txt')
for clee in dico_cluster.keys():
fname='_'.join(dico_cluster[clee])
outfilename ='/path/to/test/outfiles/'+ fname + ".txt"
for file in filenames:
tf_file=file.split('/')[-1].split('.')[0]
if tf_file in dico_cluster[clee]:
with open(outfilename, 'wb') as outfile:
for filename in filenames:
if filename == outfilename:
# don't want to copy the output into the output
continue
with open(filename, 'rb') as readfile:
shutil.copyfileobj(readfile, outfile)
But it's not working. I'm just concatenating all the files.
I want to cat the file that are in the same cluster.
I would recommend to use os package, it's easier to use.
If I understood your problem I would try to do this by loading the whole content of your files before writing it.
import os
for clee in dico_cluster.keys():
my_clusters =list(set(dico_cluster[clee]))
fname = "_".join(my_clusters)
data = list()
outfilename = os.path.join("/path/to/test/outfiles", fname + ".txt")
for file in filenames:
tmp_dict = dict()
tf_file = os.path.basename(file).split(".")[0]
if tf_file in my_clusters:
with open(file, 'rb') as f1:
data.extend([elm for elm in f1.readlines()])
with open(outfilename, "wb") as _output_file:
for elm in data:
_output_file.write(elm)
I Have files in a directory that have many lines like this
cd /oasis/projects/nsf/ets100/
oconv /oasis/projects/nsf/ets100/rla
I would like to insert the word "sky/" so the second line reads like this
oconv /oasis/projects/nsf/sky/ets100/rla
What is the best way to do this?
I know that the beginning of the code will be something like this:
path = r"c:\test"
for root, subFolders, files in os.walk(path):
for file in files:
with open(os.path.join(root, file), 'r') as fRead:
if line.startswith("oconv"):
You can do something like this
def replace_line(file):
with open(os.path.abspath(file), 'r') as file_obj:
data = file_obj.readlines()
for index, lines in enumerate(data):
if lines.startswith('oconv'):
data[index] = lines.replace('/nsf/', '/nsf/sky/')
with open(os.path.abspath(file), 'w') as file_obj:
file_obj.writelines(data)
path = r"c:\test"
for root, subFolders, files in os.walk(path):
for file in files:
replace_line(file)
You can use regex to update the string:
import os
import re
path = r"c:\test"
for root, subFolders, files in os.walk('test'):
for file in files:
with open(os.path.join(root, file), 'r+') as fh: #open the file with 'r+' to read and write
content = fh.read()
new_content = re.sub(r'oconv \/oasis\/projects\/nsf\/(.*)', r'oconv /oasis/projects/nsf/sky/\1', content) #replace strings in file
fh.seek(0)
fh.truncate()
fh.write(new_content)