Converting multiple text files to a csv to create a labelled dataset - python

I have text files in multiple folders(folder names are the names of categories/labels). I want to generate a csv file(dataset) that also has a column as the label(folder name) of that category of text.
import csv
import os
# Question code as posted (indentation was lost in this paste). It reproduces
# the FileNotFoundError quoted below, so the logic is left exactly as written.
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
# Every entry of the dataset directory is treated as a category/label folder.
folder_list = os.listdir(folder)
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w") as outfile:
writer = csv.writer(outfile)
writer.writerow(['Label', 'Email','Message'])
for f in folder_list:
file_list = os.listdir(folder+"/"+f+"/")
print(file_list)
for file in file_list:
# os.listdir() returned bare file names, so this open() resolves `file`
# against the current working directory instead of folder/f.
with open(file, "r") as infile:
contents = infile.read()
# The rows are written with outfile.write() directly, not through `writer`.
outfile.write(f+',')
outfile.write(contents)
But I'm getting
File "/home/jaideep/Desktop/folder/ML DS/Csv/Main.py", line 15, in <module>
with open(file, "r") as infile:
FileNotFoundError: [Errno 2] No such file or directory: 'file2.txt'
I know similar questions have been asked before, but I couldn't find a solution to my issue. Any help would be appreciated, thanks.

os.listdir only lists the filenames of a directory, so you need to reconstruct the path.
You may want to check out glob for that matter.
This version should solve your issue.
import csv
import os
# Build a labelled CSV: one row per text file, labelled with its folder name.
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
# newline="" lets the csv module control the line endings itself.
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    # NOTE(review): the header names three columns, but only the label and
    # the file contents are available per row -- confirm the intended schema.
    writer.writerow(['Label', 'Email','Message'])
    for f in folder_list:
        # os.listdir() returns bare names, so rebuild the full path.
        file_list = os.listdir(os.path.join(folder, f))
        print(file_list)
        for file in file_list:
            with open(os.path.join(folder, f, file), "r") as infile:
                contents = infile.read()
            # Go through the csv writer so that commas, quotes and newlines
            # inside the text are escaped and each file gets its own row.
            writer.writerow([f, contents])

Related

Go through files in given directory with python, read each file line by line and remove first and last string in the line and save updated file

So I have some .txt files inside of directory. Each .txt file contains some paths like:
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module.c'
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module2.c'
'C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module3.c'
I just need a small function that goes through each line of each file inside a directory and removes the ' characters, so that only the clean path is left, like:
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module.c
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module2.c
C:\d\folder\project\folder\Folder1\Folder2\Folder3\Module3.c
My code at the moment is:
# Question code as posted (indentation lost); `files` is defined elsewhere.
for filename in files:
with open(filename, 'r') as file:
content = file.read().split('\n')
for line in content:
# startswith('') and endswith('') are true for every string, so this test
# filters nothing -- the quote character appears to be missing from the
# literals. TODO confirm against the original post.
if line.startswith('')and line.endswith(''):
# `remove` is not defined in this snippet, and nothing is written back.
remove('')
Please assist!
SOLUTION:
I have managed to find a solution with a bit different approach:
# Strip every single quote from each file, rewriting the file in place.
for filename in files:
    # The context manager closes the handle even if read() raises.
    with open(filename, 'rt') as f:
        filedata = f.read()
    filedata = filedata.replace("'","")
    # Re-open for writing only after the read handle has been closed.
    with open(filename, 'wt') as f:
        f.write(filedata)
Thanks!
Python lets you nest quote characters: a single quote can be written inside a double-quoted string, so you can split each line on "'". Because every line starts with ', the first element of the split is the empty string and the second element is your path:
line.split("'")[1]
Edit: If i understood you correctly you want this
# Replace each file's quoted paths with the bare paths, one per line.
for filename in files:
    paths = []
    with open(filename, 'r') as file:
        for line in file.read().split('\n'):
            # split('\n') yields an empty string after the final newline.
            if not line:
                continue
            parts = line.split("'")
            # Keep a line without quotes unchanged instead of raising
            # IndexError on parts[1].
            paths.append(parts[1] if len(parts) > 1 else line)
    with open(filename, 'w') as file:
        # writelines() adds no separators, so terminate each path here.
        file.writelines(path + '\n' for path in paths)
Soo I just did bit different approach and managed to find a solution:
# Strip every single quote from each file, rewriting the file in place.
for filename in files:
    # The context manager closes the handle even if read() raises.
    with open(filename, 'rt') as f:
        filedata = f.read()
    filedata = filedata.replace("'","")
    # Re-open for writing only after the read handle has been closed.
    with open(filename, 'wt') as f:
        f.write(filedata)
Thanks guys anyway!

How to get number only values from a specific row from different text file

I am trying to get only numbers from a particular row from 10 different text files. As an output, I want those numbers appended as a list. I'm a new learner. I would appreciate your help.
tried this one but not working
import os
import sys,re
# Question code as posted (indentation lost); kept exactly as written.
# Index passed to all_lines_variable[...] below; list indexing is zero-based.
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
for filename in os.listdir(path):
# This handle uses the joined path, but `f` is never read afterwards.
with open(os.path.join(path, filename), 'r') as f:
#print (filename)
# Second open() uses the bare name, resolved against the current directory.
file = open(filename)
all_lines_variable = file.readlines()
# sys.stdout is replaced by a file opened in append mode so print() writes there.
sys.stdout = open("output", "a") #print output file
print(filename, all_lines_variable[line_number])
sys.stdout.close()
You can try this script: it reads the line at index `line_number` (zero-based) from each file in the folder, extracts the numbers on it, and writes them to output.txt, one line per input file:
import os
import re
# Zero-based index of the row to extract (69 selects the 70th line).
line_number=69
path = r'C:\Users\arpan\OneDrive\Desktop\New folder'
with open('output.txt', 'w') as f_out:
    for file in os.listdir(path):
        full_path = os.path.join(path, file)
        # os.listdir() also returns sub-directories, which cannot be read.
        if not os.path.isfile(full_path):
            continue
        with open(full_path, 'r') as f_in:
            lines = f_in.readlines()
        # Skip files too short to contain the requested row rather than
        # aborting the whole run with IndexError.
        if line_number >= len(lines):
            continue
        # Write the digit runs found on that row, space-separated.
        print(' '.join(re.findall(r'\d+', lines[line_number])), file=f_out)

Table containing micro-averaged and macro-averaged F1-scores

I have a folder that contains 5 different folders, where each folder has 50 email documents belonging to a particular topic (so, there are a total of 5 topics/classes).
Train two classifiers – a Decision Tree, and an SVC (with linear kernel). Report microaveraged and macro-averaged F1-scores of 10-fold cross-validation. You may need to preprocess the data, prune the decision tree, and find good C values for SVC
Can you help me with Table containing micro-averaged and macro-averaged F1-scores.
I tried putting the mails from each folder into one txt file, but when I try to train the decision tree the process still fails
and I am unable to get any results.
Should I put all the files in a said folder into one text file?
# Question code as posted (indentation lost). The fragment starts mid-script:
# `files` and `glob` are used here before any visible definition.
with open ("C:/Users/*******/DS Assign/toclassify/cwx.txt", "w") as outfile:
for f in files:
with open(f) as infile:
for line in infile:
outfile.write(line)
# The same glob-then-concatenate steps are repeated for each further folder.
path = ("C:/Users/*******/DS Assign/toclassify/ra/*")
files = glob.glob(path)
#print(files)
with open ("C:/Users/*******/DS Assign/toclassify/ra.txt", "w") as outfile:
for f in files:
with open(f) as infile:
for line in infile:
outfile.write(line)
path = ("C:/Users/*******/DS Assign/toclassify/rsh/*")
files = glob.glob(path)
#print(files)
with open ("C:/Users/*******/DS Assign/toclassify/rsh.txt", "w") as outfile:
for f in files:
with open(f) as infile:
for line in infile:
outfile.write(line)
path = ("C:/Users/*******/DS Assign/toclassify/src/*")
files = glob.glob(path)
#print(files)
with open ("C:/Users/*******/DS Assign/toclassify/src.txt", "w") as outfile:
for f in files:
with open(f) as infile:
for line in infile:
outfile.write(line)
# The listing for the last folder is built, but the fragment ends before it is used.
path = ("C:/Users/*******/DS Assign/toclassify/tpm/*")
files = glob.glob(path)
#print(files)
import os
import pandas as pd
# Root directory whose entries are iterated as folders by the loop below.
data_dir = os.path.join('.', 'data')
# Parallel accumulators: file names and the text read from each file.
data_ids = []
data_txt = []
# Create a helper function to read the data from a particular folder and file
def get_data(file_name, folder_dir):
    """Return the full text of ``file_name`` located inside ``folder_dir``.

    Args:
        file_name: Name of the file, relative to ``folder_dir``.
        folder_dir: Directory that contains the file.

    Returns:
        The entire contents of the file as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    file_path = os.path.join(folder_dir, file_name)
    # A context manager closes the handle promptly instead of leaving it
    # to the garbage collector.
    with open(file_path, 'r') as handle:
        return handle.read()
# Loop through each folder in the data directory
data_labels = []
for folder in os.listdir(data_dir):
    # Create the folder directory from the data directory
    folder_dir = os.path.join(data_dir, folder)
    # List the folder once so the ids, texts and labels stay aligned
    file_names = os.listdir(folder_dir)
    # Store the IDs of each file in the particular folder directory into a list
    data_ids += file_names
    # Using list comprehension to create a list of the text contained in each file
    # for a particular ID in the folder directory
    data_txt += [get_data(data_id, folder_dir) for data_id in file_names]
    # The folder name is the class label of every file inside it
    data_labels += [folder] * len(file_names)
# Store into a Pandas dataframe for easy integration into modelling packages
df = pd.DataFrame({
    'id': data_ids,
    'text': data_txt,
    'label': data_labels
})

Merging all csvs in a folder and adding a new column with filename of original file in Python

I am trying to merge all the csv files in a folder into one large csv file. I also need to add a new column to this merged csv that shows the original file that each row came from. This is the code I have so far:
import csv
import glob
# Question code as posted (indentation lost); kept exactly as written.
# NOTE(review): "*.csv" would also match combined.files.csv if it already
# exists in the working directory -- verify before re-running.
read_files = glob.glob("*.csv")
# Collects the input file names; nothing below writes them into the output.
source = []
with open("combined.files.csv", "wb") as outfile:
for f in read_files:
source.append(f)
# Each input is copied byte-for-byte, so no per-row column is added.
with open(f, "rb") as infile:
outfile.write(infile.read())
I know I have to somehow repeat each f for as many rows as are in each csv and then append that as a new column to the .write command, but I am not sure how to do this. Thank you everyone!
If you add the filename as the final column, you don't need to parse the csv at all. Just read them line by line, add filename and write. And don't open in binary mode!
import glob
import os
# Merge every CSV in the working directory, tagging each row with its origin.
out_filename = "combined.files.csv"
# Delete a stale result first so the glob below cannot pick it up as input.
if os.path.exists(out_filename):
    os.remove(out_filename)
read_files = glob.glob("*.csv")
with open(out_filename, "w") as outfile:
    for filename in read_files:
        with open(filename) as infile:
            # Append the source file name as the final column of every line.
            outfile.writelines(
                '{},{}\n'.format(line.strip(), filename) for line in infile
            )
If your CSVs have a common header line, pick one to write to the outfile and suppress the rest:
import os
import glob
# Merge every CSV in the working directory, keeping a single header row and
# appending the source file name as the last column.
want_header = True
out_filename = "combined.files.csv"
# Delete a stale result first so the glob below cannot pick it up as input.
if os.path.exists(out_filename):
    os.remove(out_filename)
read_files = glob.glob("*.csv")
with open(out_filename, "w") as outfile:
    for filename in read_files:
        with open(filename) as infile:
            # next() with a default returns None for an empty file instead
            # of raising StopIteration.
            header = next(infile, None)
            if header is None:
                continue
            if want_header:
                # Only the first non-empty file contributes the header.
                outfile.write('{},Filename\n'.format(header.strip()))
                want_header = False
            for line in infile:
                outfile.write('{},{}\n'.format(line.strip(), filename))

batch process text to csv using python

I need some help with converting a number of text files to csv files. All my text files are in one folder and I want to convert them to csv files into another folder. The names of individual files should remain the same. Below is the script I got so far...converting an individual file works fine but to work on all the files within a folder is where I am stuck. Any help will be appreciated.
import csv
import os
# Question code as posted (Python 2, indentation lost); kept exactly as written.
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder")
# Both names hold the folder strings themselves, not lists of files, so the
# loops below iterate over the characters of each string.
txt_files = directory
csv_files = output
try:
for txt_file in txt_files:
in_txt = csv.reader(open(txt_file, "rb"), delimiter = '=')
for csv_file in csv_files:
out_csv = csv.writer(open(csv_file, 'wb'))
out_csv.writerows(in_txt)
# The bare except catches every error and prints only an empty line.
except:
print ()
glob.glob() is perfectly suited for the task. Also, use with context manager when working with files:
import csv
import glob
import os
# Convert each '='-delimited .txt file in the input folder into a .csv with
# the same base name in the output folder (Python 2: raw_input, binary mode).
directory = raw_input("INPUT Folder:")
output = raw_input("OUTPUT Folder:")
txt_files = os.path.join(directory, '*.txt')
for txt_file in glob.glob(txt_files):
    # Swap the .txt extension for .csv, keeping the base name.
    base_name = os.path.splitext(os.path.basename(txt_file))[0]
    filename = base_name + '.csv'
    target_path = os.path.join(output, filename)
    with open(txt_file, "rb") as input_file, open(target_path, 'wb') as output_file:
        in_txt = csv.reader(input_file, delimiter='=')
        out_csv = csv.writer(output_file)
        out_csv.writerows(in_txt)

Categories