batch process text to csv using python

batch process text to csv using python - python

I need some help with converting a number of text files to csv files. All my text files are in one folder and I want to convert them to csv files into another folder. The names of individual files should remain the same. Below is the script I got so far...converting an individual file works fine but to work on all the files within a folder is where I am stuck. Any help will be appreciated.
import csv
import os
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder")
txt_files = directory
csv_files = output
try:
for txt_file in txt_files:
in_txt = csv.reader(open(txt_file, "rb"), delimiter = '=')
for csv_file in csv_files:
out_csv = csv.writer(open(csv_file, 'wb'))
out_csv.writerows(in_txt)
except:
print ()

glob.glob() is perfectly suited for the task. Also, use with context manager when working with files:
import csv
import glob
import os
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")
txt_files = os.path.join(directory, '*.txt')
for txt_file in glob.glob(txt_files):
with open(txt_file, "rb") as input_file:
in_txt = csv.reader(input_file, delimiter='=')
filename = os.path.splitext(os.path.basename(txt_file))[0] + '.csv'
with open(os.path.join(output, filename), 'wb') as output_file:
out_csv = csv.writer(output_file)
out_csv.writerows(in_txt)

Related

Converting multiple text files to a csv to create a labelled dataset

I have text files in multiple folders(folder names are the names of categories/labels). I want to generate a csv file(dataset) that also has a column as the label(folder name) of that category of text.
import csv
import os
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w") as outfile:
writer = csv.writer(outfile)
writer.writerow(['Label', 'Email','Message'])
for f in folder_list:
file_list = os.listdir(folder+"/"+f+"/")
print(file_list)
for file in file_list:
with open(file, "r") as infile:
contents = infile.read()
outfile.write(f+',')
outfile.write(contents)
But I'm getting
File "/home/jaideep/Desktop/folder/ML DS/Csv/Main.py", line 15, in <module>
with open(file, "r") as infile:
FileNotFoundError: [Errno 2] No such file or directory: 'file2.txt'
I know there are similar questions previously asked, but I couldn't file solution to my issue. Any help would be appreciated, thanks.

os.listdir only lists the filenames of a directory, so you need to reconstruct the path.
You may want to check out glob for that matter.
This version should solve your issue.
import csv
import os
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w") as outfile:
writer = csv.writer(outfile)
writer.writerow(['Label', 'Email','Message'])
for f in folder_list:
file_list = os.listdir(os.path.join(folder, f))
print(file_list)
for file in file_list:
with open(os.path.join(folder, f, file), "r") as infile:
contents = infile.read()
outfile.write(f+',')
outfile.write(contents)

Concatenate all files that map values in the same key

I have a dictionnary that group different pattern :
dico_cluster={'cluster_1': ['CUX2', 'CUX1'], 'cluster_2': ['RFX3', 'RFX2'],'cluster_3': ['REST']}
Then I have files in a folder :
"/path/to/test/files/CUX1.txt"
"/path/to/test/files/CUX2.txt"
"/path/to/test/files/RFX3.txt"
"/path/to/test/files/RFX2.txt"
"/path/to/test/files/REST.txt"
"/path/to/test/files/ZEB.txt"
"/path/to/test/files/TEST.txt"
I'm trying to concatenate the files that are in the same cluster. The output file name should be the name of pattern join by underscore "_"
I tried this :
filenames = glob.glob('/path/to/test/files/*.txt')
for clee in dico_cluster.keys():
fname='_'.join(dico_cluster[clee])
outfilename ='/path/to/test/outfiles/'+ fname + ".txt"
for file in filenames:
tf_file=file.split('/')[-1].split('.')[0]
if tf_file in dico_cluster[clee]:
with open(outfilename, 'wb') as outfile:
for filename in filenames:
if filename == outfilename:
# don't want to copy the output into the output
continue
with open(filename, 'rb') as readfile:
shutil.copyfileobj(readfile, outfile)
But it's not working. I'm just concatenating all the files.
I want to cat the file that are in the same cluster.

I would recommend to use os package, it's easier to use.
If I understood your problem I would try to do this by loading the whole content of your files before writing it.
import os
for clee in dico_cluster.keys():
my_clusters =list(set(dico_cluster[clee]))
fname = "_".join(my_clusters)
data = list()
outfilename = os.path.join("/path/to/test/outfiles", fname + ".txt")
for file in filenames:
tmp_dict = dict()
tf_file = os.path.basename(file).split(".")[0]
if tf_file in my_clusters:
with open(file, 'rb') as f1:
data.extend([elm for elm in f1.readlines()])
with open(outfilename, "wb") as _output_file:
for elm in data:
_output_file.write(elm)

How to get number only values from a specific row from different text file

I am trying to get only numbers from a particular row from 10 different text files. As an output, I want those numbers appended as a list. I'm a new learner. I would appreciate your help.
tried this one but not working
import os
import sys,re
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
for filename in os.listdir(path):
with open(os.path.join(path, filename), 'r') as f:
#print (filename)
file = open(filename)
all_lines_variable = file.readlines()
sys.stdout = open("output", "a") #print output file
print(filename, all_lines_variable[line_number])
sys.stdout.close()

You can try this script, it will extract from all files line number 69 and then appends it to output.txt file:
import os
import re
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
with open('output.txt', 'w') as f_out:
for file in os.listdir(path):
with open(os.path.join(path, file), 'r') as f_in:
lines = f_in.readlines()
print(' '.join(re.findall(r'\d+', lines[line_number])), file=f_out)

How to rename file names using text file in python

I need to rename a bunch of files in a folder with new name reference from a text file. Can you please give an example for this.
My New Names In a Text file:
1BA
1BB
1BC
1BD
1BE
1BF
1C0
1C1
1C2
1C3
Like this.
Updated Code:
import csv
import os
with open('names.txt') as f2:
filedata = f2.read().split(",")
os.rename(filedata[0].strip(), filedata[1].strip())
f2.close()
f2 = open ('Lines.txt','w')
f2.write(filedata)
f2.close()

What about using a CSV (comma separated) file for input in the format oldPath, newPath and do the following:
import csv
import os
with open('names.csv') as csvfile:
reader = csv.reader(csvfile)
for row in reader:
oldPath = row[0]
newPath = row[1]
os.rename(oldPath, newPath)
Alternatively, if you want to move the file to another directory/filesystem you can have a look at shutil.move

# Create old.txt and rename.txt
# Do not include file path inside the txt files
# For each line, write a filename include the extension
from pathlib import Path
import os
import sys
print("enter path of folder")
path = input()
oldList = []
with open(path + "\\old.txt", "r", encoding="utf8", errors='ignore') as readtxt:
for f in readtxt:
fname = f.rstrip()
oldList.append(fname)
# i is index of oldList
i = 0
newList = []
with open(path + "\\rename.txt", "r", encoding="utf8", errors='ignore') as readtxt:
for f in readtxt:
newName = f.rstrip()
os.rename(path + "\\" + oldList[i], path + "\\" + newName)
i += 1

How to output in different directory?

I have this:
from os import path
base_path = "C:\\texts\\*.txt"
for file in files:
with open (file) as in_file, open(path.join(base_path,"%s_tokenized.txt" % file), "w") as out_file:
data = in_file.readlines()
for line in data:
words = line.split()
str1 = ','.join(words)
out_file.write(str1)
out_file.write("\n")
It produced tokenized files in the same directory it reads from. How can I output those out_files in different directory such as "C:\\texts\\Tokenized" ?
I know there are some ways to move those new files to other directory after producing them, but what I wanna know is that if there is anyway to output new files to other directory at the same time they are produced in above code?

Is this what you're looking for:
import os
import glob
source_pattern = 'c:/texts/*.txt'
output_directory = 'c:/texts/tokenized'
# Iterate over files matching source_pattern
for input_file in glob.glob(source_pattern):
# build the output filename
base,ext = os.path.splitext(os.path.basename(input_file))
output_file = os.path.join(output_directory,base + '_tokenized' + ext)
with open(input_file) as in_file, open(output_file,'w') as out_file:
for line in in_file:
out_file.write(','.join(line.split()) + '\n')

This is how I output to files in arbitrary directories :
dir_name = "../some_dir"
if not os.path.exists(dir_name) : os.makedirs(dir_name)
out_file_name = dir_name + '/out.txt'
out_file = open( out_file_name, 'w')
EDIT :
file_name = "{0}_tokenized.txt".format(something_from_tokenizing)
if not os.path.exists(dir_name) : os.makedirs(dir_name)
out_file_name = dir_name + file_name
EDIT :
I just tried it, worked for me. You simply need two paths, one for the source directory and one for the destination. Hope this helps.
import os
from os import path
f1 = open("in.txt")
f2 = open("out.txt")
files = ["in.txt", "out.txt"]
base_path = "."
dir_name = "./some_dir"
if not os.path.exists(dir_name) : os.makedirs(dir_name)
for file in files:
with open (file) as in_file, open(path.join(dir_name,"%s_tokenized.txt" % file), "w") as out_file:
data = in_file.readlines()
for line in data:
words = line.split()
str1 = ','.join(words)
out_file.write(str1)
out_file.write("\n")

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

batch process text to csv using python - python

Related

Converting multiple text files to a csv to create a labelled dataset

Concatenate all files that map values in the same key

How to get number only values from a specific row from different text file

How to rename file names using text file in python

How to output in different directory?

Categories

Resources