Reading folder and writing in .txt - python

I am trying to read all .txt files in a folder and write their contents to a .csv file. Even with (in theory) an unlimited number of .txt files, each containing (in theory) a lot of data, I want the result laid out as a table, like in Excel.
import os, csv
# Collect the whitespace-separated tokens of every file found under
# `directory` into `dictionary` (file-name stem -> list of tokens), then write
# them to a semicolon-delimited CSV with one column per file.
# NOTE(review): the indentation of this snippet was lost and the per-file
# body appears twice (once unfiltered, once behind the ".txt" check), so the
# intended nesting cannot be determined from the text alone.
destinationfile = "meine.csv"
dictionary = {}
directory = os.path.normpath("C:/Users/Viktor/PycharmProjects/pythonProject1")
for subdir, dirs, files in os.walk(directory):
for file in files:
# The column header is the part of the file name before the first dot.
split = file.split(".", 1)
header = split[0]
# NOTE(review): this opens the bare file name, which is resolved against the
# current working directory rather than `subdir`; it is the call reported in
# the traceback for '.gitignore'. The handle is never closed.
readfiles = open(file, "r")
tlist = []
for line in readfiles:
split_line = line.split()
# `split` is reused here as the loop variable, shadowing the list above.
for split in split_line:
tlist.append(split)
dictionary.update({header: tlist})
if file.endswith(".txt"):
# `f` is opened with the full path, but its content `a` is not used below.
f=open(os.path.join(subdir, file),'r')
a = f.read()
split = file.split(".", 1)
header = split[0]
# Same bare-name open as above.
readfile = open(file, "r")
tlist = []
for line in readfile:
split_line = line.split()
for split in split_line:
tlist.append(split)
dictionary.update({header: tlist})
with open(destinationfile, "w", newline='') as csv_file:
writer = csv.writer(csv_file,dialect='excel',delimiter=";")
# Header row: the dictionary keys in insertion order.
headers = list(dictionary)
writer.writerow(headers)
# zip() stops at the shortest token list, so longer columns are truncated.
for entry in zip(*dictionary.values()):
writer.writerow(entry)
It should look something like this in the end:
001;002;003;
a;d;g
b;e;h
c;f;i
001/002/003 are the names of my .txt files and a,b,c,d,e... is the info in my .txt files.
But, when I run the code I get this error:
Traceback (most recent call last):
File "C:\Users\Viktor\PycharmProjects\pythonProject1\main.py", line 11, in <module>
readfiles = open(file, "r")
FileNotFoundError: [Errno 2] No such file or directory: '.gitignore'
Process finished with exit code 1

Related

How to read a line while reading a file in Python

Imagine that another filename in the same directory is inside the txt file we're currently in:
For example, let file A be the following:
B.txt
computer
science
How would it be possible to read the other lines and go into B.txt after we're done reading?
If you want to read first line separately, you can do it with readline(). Loop then proceeds to read the file from the second line to the end of file:
import os
def read_files_to_list(wordlist, file):
    """Append the lines of *file* to *wordlist* and report a follow-up file.

    The first line is treated as a reference to another file when it names
    an existing regular file; in that case it is not stored as a word and is
    returned so the caller can read that file next. Otherwise the first line
    is an ordinary word and is appended like every other line.

    Args:
        wordlist: List that receives the stripped lines; mutated in place.
        file: Path of the text file to read.

    Returns:
        A ``(wordlist, next_file)`` tuple. ``next_file`` is the path named on
        the first line, or ``None`` when the first line is not a file.

    Raises:
        OSError: If *file* cannot be opened.
    """
    with open(file, "r") as f:
        first_line = f.readline()
        if not first_line:
            # readline() returns "" only at EOF: the file is empty, so there
            # is nothing to collect and nothing to follow.
            return wordlist, None

        candidate = first_line.strip()  # drop the newline and whitespace
        # isfile() rather than exists(): a first line that happens to name a
        # directory must be kept as a word, not handed back to be open()ed.
        if os.path.isfile(candidate):
            next_file = candidate
        else:
            wordlist.append(candidate)
            next_file = None

        wordlist.extend(line.strip() for line in f)
    return wordlist, next_file
# Follow the chain of file references starting at A.txt and collect every
# word along the way.
next_file = "A.txt"
listofwords = []
visited = set()
# Stop when no further file is referenced, or when a file points back to one
# that has already been read (which would otherwise loop forever).
while next_file is not None:
    file_id = os.path.abspath(next_file)
    if file_id in visited:
        break
    visited.add(file_id)
    listofwords, next_file = read_files_to_list(listofwords, next_file)

Creating new files through loop using python3

I have a dataset with 5 folders, each of which contains 100 .txt files. In the code below you can see that I loop through every file and remove certain words from it using my StopWords.txt file.
After removing the words, I append the output to a single file (filteredtext.txt). But I want the output to have exactly the same structure as my dataset (5 folders, each with 100 .txt files).
This is my code.
import re
import os
# Load the stop words: StopWords.txt is read in full and split on whitespace.
# The handle `stopwordfile` is not closed anywhere in this snippet.
stopwordfile = open("StopWords.txt", encoding='utf-8')
# Read the whole file content into one string.
readstopword = stopwordfile.read()
stop_words = readstopword.split()
# Walk the dataset folder and process every file found below it.
for path, _, files in os.walk("sinhala-set1"):
for file_name in files:
filepath = os.path.join(path, file_name)
print(f"Checking --> {filepath}")
# `file1` is not closed anywhere in this snippet.
file1 = open(filepath, encoding='utf-8')
# Read the whole file content into one string.
line = file1.read()
words = line.split()
for r in words:
# Keep only words that are not in the stop-word list (a list membership test).
if not r in stop_words:
# Every kept word goes to the same output file, opened in append mode, so
# the results of all input files end up concatenated in filteredtext.txt.
# NOTE(review): indentation was lost; the open/close presumably happens once
# per kept word.
appendFile = open('filteredtext.txt','a', encoding='utf-8')
appendFile.write(" "+r)
appendFile.close()
You are appending to a single file because you open the same .txt file in append mode every time: appendFile = open('filteredtext.txt','a', encoding='utf-8'). If you want a separate output file for each input file, open a different file for each one, like this:
# One output file per input file; 'w' starts each of them empty.
output_file = open('output_' + file_name, 'w', encoding='utf-8')

Converting multiple text files to a csv to create a labelled dataset

I have text files in multiple folders(folder names are the names of categories/labels). I want to generate a csv file(dataset) that also has a column as the label(folder name) of that category of text.
import csv
import os
# Build data.csv from the text files found in each sub-folder of `folder`;
# the sub-folder name `f` is written in front of each file's contents.
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w") as outfile:
writer = csv.writer(outfile)
writer.writerow(['Label', 'Email','Message'])
for f in folder_list:
# os.listdir returns bare file names, without the directory part.
file_list = os.listdir(folder+"/"+f+"/")
print(file_list)
for file in file_list:
# NOTE(review): `file` is a bare name, so it is resolved against the current
# working directory instead of folder/f; this is the call that raises the
# FileNotFoundError shown in the traceback.
with open(file, "r") as infile:
contents = infile.read()
# The data is written directly to the file object, not through `writer`.
outfile.write(f+',')
outfile.write(contents)
But I'm getting
File "/home/jaideep/Desktop/folder/ML DS/Csv/Main.py", line 15, in <module>
with open(file, "r") as infile:
FileNotFoundError: [Errno 2] No such file or directory: 'file2.txt'
I know similar questions have been asked before, but I couldn't find a solution to my issue. Any help would be appreciated, thanks.
os.listdir only lists the filenames of a directory, so you need to reconstruct the path.
You may want to check out glob for that matter.
This version should solve your issue.
import csv
import os
# Build a labelled dataset: one CSV row per text file, labelled with the name
# of the sub-folder the file lives in.
folder = os.path.dirname("/home/jaideep/Desktop/folder/ML DS/Csv/Datasets/")
folder_list = os.listdir(folder)
# newline="" lets the csv module control the line endings itself.
with open("/home/jaideep/Desktop/folder/ML DS/Csv/data.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['Label', 'Email','Message'])
    for f in folder_list:
        label_dir = os.path.join(folder, f)
        if not os.path.isdir(label_dir):
            # Stray files next to the label folders cannot be listed.
            continue
        # os.listdir only returns names, so every path is rebuilt with join.
        file_list = os.listdir(label_dir)
        print(file_list)
        for file in file_list:
            with open(os.path.join(label_dir, file), "r") as infile:
                contents = infile.read()
            # Write through the csv writer so commas, quotes and newlines in
            # the text are quoted and every file ends up on its own row.
            # NOTE(review): the header names three columns but only the label
            # and the text are available here - confirm the intended layout.
            writer.writerow([f, contents])

Why do I have a 'No such file or directory error' when I have created the file in the block of code?

I have a directory of csv files which I would like to read, extract the required information, and then save this as another csv file in another directory. I have defined blocks of code to complete the process I require, however I have run into a 'No file or directory' error when trying to test one file. As far as I can tell, I am creating the required file within the code, and so I don't understand why the file does not exist.
# Reads the CSV at `file`, passes the last field of every row to
# parseFormPage, and writes each result as a row of a CSV that has the same
# base name as `file` and is placed in `path`.
def writeFormPage(file, path):
'''
Input:
Index CSV
Output:
Page CSV
'''
with open(file, 'r') as rf:
reader = csv.reader(rf)
# The output file keeps the input's file name and is placed inside `path`.
base_name = os.path.basename(file)
file_path = os.path.join(path, base_name)
# Opening for writing creates the file but not a missing parent directory.
with open(file_path, 'w') as wf:
writer = csv.writer(wf, delimiter = ',')
for line in reader:
# The last field of the row is passed to parseFormPage (defined elsewhere).
url = line[-1]
page_data = (parseFormPage(url))
writer.writerow(page_data)
# Sleep for a random 3 to 6 seconds; presumably once per row, to throttle
# parseFormPage - indentation was lost, confirm.
time.sleep(3 + random.random() * 3)
# NOTE(review): after this chdir every relative path, including PAGE_DIR
# itself when it is relative (the traceback shows './page/2018Q4.csv'), is
# resolved against the new working directory.
os.chdir(PAGE_DIR)
demo_index_csv = '/Users/alexajones/index/2018Q4.csv'
# testing!!!
writeFormPage(demo_index_csv, PAGE_DIR)
FileNotFoundError Traceback (most recent call last)
<ipython-input-14-81e495544fd6> in <module>
4
5 # testing!!!
----> 6 writeFormPage(demo_index_csv, PAGE_DIR)
<ipython-input-12-ff5447b72e24> in writeFormPage(file, path)
13 file_path = os.path.join(path, base_name)
14
---> 15 with open(file_path, 'w') as wf:
16 writer = csv.writer(wf, delimiter = ',')
17
FileNotFoundError: [Errno 2] No such file or directory: './page/2018Q4.csv'
I am absolutely sure I've made some silly error, as I am an absolute beginner, however I haven't been able to find it so far. I'd like to make sure the code works, as eventually I will be writing a loop to complete the process for all of the csv files in the directory.
Any help will be much appreciated.
In response to answers from Barmar and Martineau; I had created the subdirectory further up in the program, 3 directories at once, though I have been having lots of 'these don't exist' problems with these. After your advice I decided to create each directory as required, as shown below, though I now have a 'doesn't exist' error in a piece of code which was working fine before. What am I doing wrong with these directories? As a side note, I need these to work on someone else's pc so I can't specify the whole path.
# Create the directory that holds the index CSV files, if it is missing.
indx_dir = './index'
if not os.path.isdir(indx_dir):
os.makedirs(indx_dir)
# NOTE(review): after this chdir the relative path './index' is resolved
# inside the index directory itself, which matches the FileNotFoundError
# raised by os.listdir(indx_dir) in the traceback.
os.chdir(indx_dir)
# New list that collects the URLs read from the CSV files.
sec_urls = []
# For each CSV file, take the field at index 4 of every row and add it to the list.
for filename in os.listdir(indx_dir):
if filename.endswith('.csv'):
with open(os.path.join(indx_dir, filename), newline='') as csvfile:
reader = csv.reader(csvfile, delimiter=',')
for line in reader:
url = line[4].strip()
sec_urls.append(url)
print(url, 'downloaded and added to list')
---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-24-281ad84b64c7> in <module>
3
4 # For each csv file, open and locate URL in line 4 and add to newly created list
----> 5 for filename in os.listdir(indx_dir):
6 if filename.endswith('.csv'):
7 with open(os.path.join(indx_dir, filename), newline='') as csvfile:
FileNotFoundError: [Errno 2] No such file or directory: './index'
Thank you to Barmar this issue has now been solved.
The code for the question that I asked originally has been changed to:
# Define a function to collect form page data and save as a new csv file
def writeFormPage(file, path):
    '''
    Read an index CSV and write the matching page CSV into `path`.

    Input:
        file: path of the index CSV; the last field of every row is passed
              to parseFormPage.
        path: directory that receives the page CSV (created when missing).
    Output:
        Page CSV with the same file name as `file`, one row per index row.
    '''
    if path:
        # open(..., 'w') creates the file but never its parent directory.
        os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, os.path.basename(file))

    # newline='' on both files, as the csv module requires, so embedded
    # newlines survive and no blank rows appear on Windows.
    with open(file, 'r', newline='') as rf, \
            open(file_path, 'w', newline='') as wf:
        reader = csv.reader(rf)
        writer = csv.writer(wf, delimiter=',')
        for line in reader:
            if not line:
                # A blank row has no last field to read.
                continue
            url = line[-1]
            page_data = parseFormPage(url)
            writer.writerow(page_data)
            # Wait a random 3 to 6 seconds before the next row; presumably
            # this throttles the page fetches done by parseFormPage.
            time.sleep(3 + random.random() * 3)
# Create the directory that receives the new CSV files; exist_ok makes this a
# no-op when it is already there, without a separate check-then-create step.
page_dir = './page'
os.makedirs(page_dir, exist_ok=True)
demo_index_csv = './index/2018Q4.csv'
# testing!!!
writeFormPage(demo_index_csv, page_dir)

How to extract a string from the file list in Python?

I have a folder with a list of 425 similar files named “00001q1.txt, 00002w2.txt, 00003e3.txt... 00425q1.txt”. Each file contains a line of text between two rows. These rows are constant in all files. I need to extract these lines and save it to output file as column of lines.
This is a script which is able to loop over all the files in a folder, but it doesn't extract the desired lines from the files into the output file.
#!/usr/bin/python
# Open a file
import re
import os
import sys
import glob
# Output file that should receive the extracted lines.
outfile = open("list7.txt", "w")
# List the names of all entries in the folder, in sorted order.
full_path = r"F:\files\list"
filelist = sorted(os.listdir( full_path ))
# Python 2 print statement.
print filelist
# This is meant to scan the files and extract the desired line that is
# located between two rows, e.g. for
# 00001q1.txt:
# Row above line
# line
# Row under line
buffer = []
# NOTE(review): `filelist` holds file names, so `line` is a file name here and
# not a line of text; no file from the folder is opened in this loop.
for line in filelist:
if line.startswith("Row above line"):
# A non-empty list marks "currently between the two rows".
buffer = ['']
elif line.startswith("Row under line"):
outfile.write("".join(buffer))
buffer = []
elif buffer:
buffer.append(line)
# infile.close()
outfile.close()
If I use a single file (for example "00001q1.txt") instead of filelist in the script, then the desired line is written to the outfile successfully. What should I do so that the script scans the whole list of files?
Thanks in advance.
You need to iterate both files and rows in each file:
# Copy the text found between "Row above line" and "Row under line" in every
# listed file to `outfile`.
buffer = []
for fileName in filelist:
    # os.listdir returns bare names, so the folder has to be joined back on;
    # plain 'r' is used because the 'U' flag was removed in Python 3.11.
    with open(os.path.join(full_path, fileName), 'r') as f:
        for line in f:
            if line.startswith("Row above line"):
                # A non-empty list marks "currently between the two rows".
                buffer = ['']
            elif line.startswith("Row under line"):
                outfile.write("".join(buffer))
                buffer = []
            elif buffer:
                buffer.append(line)
If I understand correctly, you want to write all the matching lines to list7.txt:
import os
# Copy the text found between "Row above line" and "Row under line" in every
# file of the folder to list7.txt.
full_path = r"F:\files\list"
filelist = sorted(os.listdir(full_path))
# list7.txt is opened exactly once; the with-block closes it on exit.
with open("list7.txt", "w") as outfile:
    buffer = []
    for filename in filelist:
        with open(os.path.join(full_path, filename), "r") as infile:
            # Iterate the file lazily instead of loading it with readlines().
            for line in infile:
                if line.startswith("Row above line"):
                    # A non-empty list marks "currently between the two rows".
                    buffer = ['']
                elif line.startswith("Row under line"):
                    outfile.write("".join(buffer))
                    buffer = []
                elif buffer:
                    buffer.append(line)
    # Write whatever was collected after a "Row above line" that never got
    # its closing "Row under line".
    outfile.writelines(buffer)

Categories