Concatenate all files that map values in the same key - python

I have a dictionnary that group different pattern :
dico_cluster={'cluster_1': ['CUX2', 'CUX1'], 'cluster_2': ['RFX3', 'RFX2'],'cluster_3': ['REST']}
Then I have files in a folder :
"/path/to/test/files/CUX1.txt"
"/path/to/test/files/CUX2.txt"
"/path/to/test/files/RFX3.txt"
"/path/to/test/files/RFX2.txt"
"/path/to/test/files/REST.txt"
"/path/to/test/files/ZEB.txt"
"/path/to/test/files/TEST.txt"
I'm trying to concatenate the files that are in the same cluster. The output file name should be the name of pattern join by underscore "_"
I tried this :
filenames = glob.glob('/path/to/test/files/*.txt')
for clee in dico_cluster.keys():
fname='_'.join(dico_cluster[clee])
outfilename ='/path/to/test/outfiles/'+ fname + ".txt"
for file in filenames:
tf_file=file.split('/')[-1].split('.')[0]
if tf_file in dico_cluster[clee]:
with open(outfilename, 'wb') as outfile:
for filename in filenames:
if filename == outfilename:
# don't want to copy the output into the output
continue
with open(filename, 'rb') as readfile:
shutil.copyfileobj(readfile, outfile)
But it's not working. I'm just concatenating all the files.
I want to cat the file that are in the same cluster.

I would recommend to use os package, it's easier to use.
If I understood your problem I would try to do this by loading the whole content of your files before writing it.
import os
for clee in dico_cluster.keys():
my_clusters =list(set(dico_cluster[clee]))
fname = "_".join(my_clusters)
data = list()
outfilename = os.path.join("/path/to/test/outfiles", fname + ".txt")
for file in filenames:
tmp_dict = dict()
tf_file = os.path.basename(file).split(".")[0]
if tf_file in my_clusters:
with open(file, 'rb') as f1:
data.extend([elm for elm in f1.readlines()])
with open(outfilename, "wb") as _output_file:
for elm in data:
_output_file.write(elm)

Related

Converting multiple text files to a csv to create a labelled dataset

I have text files in multiple folders(folder names are the names of categories/labels). I want to generate a csv file(dataset) that also has a column as the label(folder name) of that category of text.
import csv
import os
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w") as outfile:
writer = csv.writer(outfile)
writer.writerow(['Label', 'Email','Message'])
for f in folder_list:
file_list = os.listdir(folder+"/"+f+"/")
print(file_list)
for file in file_list:
with open(file, "r") as infile:
contents = infile.read()
outfile.write(f+',')
outfile.write(contents)
But I'm getting
File "/home/jaideep/Desktop/folder/ML DS/Csv/Main.py", line 15, in <module>
with open(file, "r") as infile:
FileNotFoundError: [Errno 2] No such file or directory: 'file2.txt'
I know there are similar questions previously asked, but I couldn't file solution to my issue. Any help would be appreciated, thanks.
os.listdir only lists the filenames of a directory, so you need to reconstruct the path.
You may want to check out glob for that matter.
This version should solve your issue.
import csv
import os
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w") as outfile:
writer = csv.writer(outfile)
writer.writerow(['Label', 'Email','Message'])
for f in folder_list:
file_list = os.listdir(os.path.join(folder, f))
print(file_list)
for file in file_list:
with open(os.path.join(folder, f, file), "r") as infile:
contents = infile.read()
outfile.write(f+',')
outfile.write(contents)

How to get number only values from a specific row from different text file

I am trying to get only numbers from a particular row from 10 different text files. As an output, I want those numbers appended as a list. I'm a new learner. I would appreciate your help.
tried this one but not working
import os
import sys,re
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
for filename in os.listdir(path):
with open(os.path.join(path, filename), 'r') as f:
#print (filename)
file = open(filename)
all_lines_variable = file.readlines()
sys.stdout = open("output", "a") #print output file
print(filename, all_lines_variable[line_number])
sys.stdout.close()
You can try this script, it will extract from all files line number 69 and then appends it to output.txt file:
import os
import re
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
with open('output.txt', 'w') as f_out:
for file in os.listdir(path):
with open(os.path.join(path, file), 'r') as f_in:
lines = f_in.readlines()
print(' '.join(re.findall(r'\d+', lines[line_number])), file=f_out)

turning multiple nested files into a single file (concatenating them)

I have a folder consisting of 7 files, each having several text files inside. I intend to read through them and write each of those nested text files into a single file called ZebraAllRaw.txt. In the end, there must be only one single file containing all the text files that existed in each of those 7 files.
This is the function I have written:
def CombineFiles(folder):
with open('D:/ZebraAllRaw.txt', 'a', encoding="utf-8") as OutFile:
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")
However, it seems that all the content is written into the new file 9 times, as if it had read through them more than expected.
make sure you con't append the files from different runs.
I only replaced the file mode append with write at the open
def CombineFiles(folder):
with open('D:/ZebraAllRaw.txt', 'w', encoding="utf-8") as OutFile: # mode "w", not "a"
for root, dirs, files in os.walk(folder, topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
with open(file_path, 'r', encoding="utf-8") as f:
content = f.read()
new_content = content.replace('\n', '')
OutFile.write(new_content + "\n")

How to output in different directory?

I have this:
from os import path
base_path = "C:\\texts\\*.txt"
for file in files:
with open (file) as in_file, open(path.join(base_path,"%s_tokenized.txt" % file), "w") as out_file:
data = in_file.readlines()
for line in data:
words = line.split()
str1 = ','.join(words)
out_file.write(str1)
out_file.write("\n")
It produced tokenized files in the same directory it reads from. How can I output those out_files in different directory such as "C:\\texts\\Tokenized" ?
I know there are some ways to move those new files to other directory after producing them, but what I wanna know is that if there is anyway to output new files to other directory at the same time they are produced in above code?
Is this what you're looking for:
import os
import glob
source_pattern = 'c:/texts/*.txt'
output_directory = 'c:/texts/tokenized'
# Iterate over files matching source_pattern
for input_file in glob.glob(source_pattern):
# build the output filename
base,ext = os.path.splitext(os.path.basename(input_file))
output_file = os.path.join(output_directory,base + '_tokenized' + ext)
with open(input_file) as in_file, open(output_file,'w') as out_file:
for line in in_file:
out_file.write(','.join(line.split()) + '\n')
This is how I output to files in arbitrary directories :
dir_name = "../some_dir"
if not os.path.exists(dir_name) : os.makedirs(dir_name)
out_file_name = dir_name + '/out.txt'
out_file = open( out_file_name, 'w')
EDIT :
file_name = "{0}_tokenized.txt".format(something_from_tokenizing)
if not os.path.exists(dir_name) : os.makedirs(dir_name)
out_file_name = dir_name + file_name
EDIT :
I just tried it, worked for me. You simply need two paths, one for the source directory and one for the destination. Hope this helps.
import os
from os import path
f1 = open("in.txt")
f2 = open("out.txt")
files = ["in.txt", "out.txt"]
base_path = "."
dir_name = "./some_dir"
if not os.path.exists(dir_name) : os.makedirs(dir_name)
for file in files:
with open (file) as in_file, open(path.join(dir_name,"%s_tokenized.txt" % file), "w") as out_file:
data = in_file.readlines()
for line in data:
words = line.split()
str1 = ','.join(words)
out_file.write(str1)
out_file.write("\n")

batch process text to csv using python

I need some help with converting a number of text files to csv files. All my text files are in one folder and I want to convert them to csv files into another folder. The names of individual files should remain the same. Below is the script I got so far...converting an individual file works fine but to work on all the files within a folder is where I am stuck. Any help will be appreciated.
import csv
import os
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder")
txt_files = directory
csv_files = output
try:
for txt_file in txt_files:
in_txt = csv.reader(open(txt_file, "rb"), delimiter = '=')
for csv_file in csv_files:
out_csv = csv.writer(open(csv_file, 'wb'))
out_csv.writerows(in_txt)
except:
print ()
glob.glob() is perfectly suited for the task. Also, use with context manager when working with files:
import csv
import glob
import os
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")
txt_files = os.path.join(directory, '*.txt')
for txt_file in glob.glob(txt_files):
with open(txt_file, "rb") as input_file:
in_txt = csv.reader(input_file, delimiter='=')
filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
with open(os.path.join(output, filename), 'wb') as output_file:
out_csv = csv.writer(output_file)
out_csv.writerows(in_txt)

Categories