turning multiple nested files into a single file (concatenating them)

turning multiple nested files into a single file (concatenating them) - python

I have a folder consisting of 7 files, each having several text files inside. I intend to read through them and write each of those nested text files into a single file called ZebraAllRaw.txt. In the end, there must be only one single file containing all the text files that existed in each of those 7 files.
This is the function I have written:
def CombineFiles(folder):
with open('D:/ZebraAllRaw.txt', 'a', encoding="utf-8") as OutFile:
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")
However, it seems that all the content is written into the new file 9 times, as if it had read through them more than expected.

make sure you con't append the files from different runs.
I only replaced the file mode append with write at the open
def CombineFiles(folder):
with open('D:/ZebraAllRaw.txt', 'w', encoding="utf-8") as OutFile: # mode "w", not "a"
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")

Related

Go through files in given directory with python, read each file line by line and remove first and last string in the line and save updated file

So I have some .txt files inside of directory. Each .txt file contains some paths like:
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module.c'
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module2.c'
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module3.c'
I need just some small function that will go through each line of each file inside of a dir and remove there ', so only clear path is left like:
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module.c
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module2.c
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module3.c
My code at the moment is:
for filename in files:
with open(filename, 'r') as file:
content = file.read().split('\n')
for line in content:
if line.startswith('')and line.endswith(''):
remove('')
Please assist!
SOLUTION:
I have managed to find a solution with a bit different approach:
for filename in files:
f = open(filename, 'rt')
filedata = f.read()
filedata = filedata.replace("'","")
f.close()
f = open(filename, 'wt')
f.write(filedata)
f.close()
Thanks!

python has a hirarchy to strings ', ", "" and so on so you can wrap a uptick into quotes for a split. Since we have the first element '' before the tick the second is your path
line.split("'")[1]
Edit: If i understood you correctly you want this
for filename in files:
paths = []
with open(filename, 'r') as file:
content = file.read().split('\n')
for line in content:
paths.append(line.split("'")[1])
file.close()
with open(filename, 'w') as file:
file.writelines(paths)
file.close()

Soo I just did bit different approach and managed to find a solution:
for filename in files:
f = open(filename, 'rt')
filedata = f.read()
filedata = filedata.replace("'","")
f.close()
f = open(filename, 'wt')
f.write(filedata)
f.close()
Thanks guys anyway!

How to combine the texts in subfolders with Python

I have a folder containing 4 subfolders and want to combine the texts in each of the subfolders (In others words, the production should be 4 combined texts respectively but not a whole text unifying all the subfolders).
The folder is like this
I want to use os.walk but have no result.
the code is below:
import os
rootdir=r'xxx\xxx\xxx'
allfiles = []
for root, dirs, files in os.walk(rootdir):
for d in dirs:
f_out = open(rootdir+d + 'combined.txt', 'w', encoding='utf-8')
for file in files:
allfiles.append(os.path.join(root, file))
for i in allfiles:
if i.endswith(r'.txt'):
f_in=open(i, 'r', encoding='utf-8')
for line in f_in.readlines():
f_out.write(line)
f_in.close()
f_out.close()

When you walk (through a storm)...
You'll need to log the files to the directory they were found in - as it is, when you get to your f_out.write() line, your way out of sync with what was happening on your reading loop.
Try this:
allFiles2 = {}
for root, dirs, files in os.walk(rootdir):
for d in dirs:
f_out = open(rootdir + d + 'combined.txt', 'w', encoding='utf-8')
allFiles2[os.path.join(root,d)] = []
for name in files:
fullName = os.path.join(root, name)
#At this point, we want to send the file to the correct dictionary position, so use the path defining the dictionary as a search item
for key in allFiles2:
if key == fullName[:len(key)]:
allFiles2[key].append(os.path.join(root, name))
for i1 in allFiles2: #Looping through the keys (directories) in allFiles2
with open(i1 + 'combined.txt', 'w', encoding='utf-8') as f_out:
for i2 in allFiles2[i1]: #Looping through the files in that directory
if i2.endswith('.txt'):
with open(i2, 'r') as f_in:
for line in f_in.readlines():
f_out.write(line)
f_out.write("\n")
Now, I'm not sure what shape you hope the output text files to be - but hopefully thats a useful starting point for you.

Creating new files through loop using python3

I have a Dataset, which has 5 folders, in which each folder has 100 .txt files. Below code you can see that I am looping through every file, and removing certain words from those files using my StopWords.txt file.
After I remove the words I am appending the output in one file(filteredtext.txt). But I want to have these output exactly as my Dataset (5 folders which has 100 .txt file).
This is my code.
import re
import os
#insert stopwords files
stopwordfile = open("StopWords.txt", encoding='utf-8')
# Use this to read file content as a stream:
readstopword = stopwordfile.read()
stop_words = readstopword.split()
#file path to dataset
for path, _, files in os.walk("sinhala-set1"):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
file1 = open(filepath, encoding='utf-8')
# Use this to read file content as a stream:
line = file1.read()
words = line.split()
for r in words:
if not r in stop_words:
appendFile = open('filteredtext.txt','a', encoding='utf-8')
appendFile.write(" "+r)
appendFile.close()

You are appending the file because you are opening the same .txt file with appending mode appendFile = open('filteredtext.txt','a', encoding='utf-8') If you want a separate file for each loop, open a different file like this:
output_file = open('output_' + file_name), 'w', encoding='utf-8')

Concatenate all files that map values in the same key

I have a dictionnary that group different pattern :
dico_cluster={'cluster_1': ['CUX2', 'CUX1'], 'cluster_2': ['RFX3', 'RFX2'],'cluster_3': ['REST']}
Then I have files in a folder :
"/path/to/test/files/CUX1.txt"
"/path/to/test/files/CUX2.txt"
"/path/to/test/files/RFX3.txt"
"/path/to/test/files/RFX2.txt"
"/path/to/test/files/REST.txt"
"/path/to/test/files/ZEB.txt"
"/path/to/test/files/TEST.txt"
I'm trying to concatenate the files that are in the same cluster. The output file name should be the name of pattern join by underscore "_"
I tried this :
filenames = glob.glob('/path/to/test/files/*.txt')
for clee in dico_cluster.keys():
fname='_'.join(dico_cluster[clee])
outfilename ='/path/to/test/outfiles/'+ fname + ".txt"
for file in filenames:
tf_file=file.split('/')[-1].split('.')[0]
if tf_file in dico_cluster[clee]:
with open(outfilename, 'wb') as outfile:
for filename in filenames:
if filename == outfilename:
# don't want to copy the output into the output
continue
with open(filename, 'rb') as readfile:
shutil.copyfileobj(readfile, outfile)
But it's not working. I'm just concatenating all the files.
I want to cat the file that are in the same cluster.

I would recommend to use os package, it's easier to use.
If I understood your problem I would try to do this by loading the whole content of your files before writing it.
import os
for clee in dico_cluster.keys():
my_clusters =list(set(dico_cluster[clee]))
fname = "_".join(my_clusters)
data = list()
outfilename = os.path.join("/path/to/test/outfiles", fname + ".txt")
for file in filenames:
tmp_dict = dict()
tf_file = os.path.basename(file).split(".")[0]
if tf_file in my_clusters:
with open(file, 'rb') as f1:
data.extend([elm for elm in f1.readlines()])
with open(outfilename, "wb") as _output_file:
for elm in data:
_output_file.write(elm)

insert a word in lines and save the files using Python

I Have files in a directory that have many lines like this
cd /oasis/projects/nsf/ets100/
oconv /oasis/projects/nsf/ets100/rla
I would like to insert the word "sky/" so the second line reads like this
oconv /oasis/projects/nsf/sky/ets100/rla
What is the best way to do this?
I know that the beginning of the code will be something like this:
path = r"c:\test"
for root, subFolders, files in os.walk(path):
for file in files:
with open(os.path.join(root, file), 'r') as fRead:
if line.startswith("oconv"):

You can do something like this
def replace_line(file):
with open(os.path.abspath(file), 'r') as file_obj:
data = file_obj.readlines()
for index, lines in enumerate(data):
if lines.startswith('oconv'):
data[index] = lines.replace('/nsf/', '/nsf/sky/')
with open(os.path.abspath(file), 'w') as file_obj:
file_obj.writelines(data)
path = r"c:\test"
for root, subFolders, files in os.walk(path):
for file in files:
replace_line(file)

You can use regex to update the string:
import os
import re
path = r"c:\test"
for root, subFolders, files in os.walk('test'):
for file in files:
with open(os.path.join(root, file), 'r+') as fh: #open the file with 'r+' to read and write
content = fh.read()
new_content = re.sub(r'oconv \/oasis\/projects\/nsf\/(.*)', r'oconv /oasis/projects/nsf/sky/\1', content) #replace strings in file
fh.seek(0)
fh.truncate()
fh.write(new_content)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

turning multiple nested files into a single file (concatenating them) - python

Related

Go through files in given directory with python, read each file line by line and remove first and last string in the line and save updated file

How to combine the texts in subfolders with Python

Creating new files through loop using python3

Concatenate all files that map values in the same key

insert a word in lines and save the files using Python

Categories

Resources