Running script on every text file in folder - python

I'm looking to run my script on all text files in a directory, but I'm having a lot of trouble.
Here is the code I have so far:
# Parse one instrument log: '!' lines carry the date, '#' lines start a new
# sample record (ID), '?' switches the sample number, data lines hold
# "<name> (<unit>) <value>", and a blank line closes the current record.
data = {}
date = ID = None
infile = "z140327b.txt"
outfile = 'o' + infile  # output name = input name preceded by 'o'
sample = 1
with open(infile) as datafile, open(outfile, 'w') as f2:
    for line in datafile:
        if line.lstrip().startswith('!'):
            date = line[1:].strip()
        elif line.lstrip().startswith('?'):
            sample = 2
        elif line.lstrip().startswith('#'):
            ID = line[1:].strip()
            data[ID] = {'date': date}
            tedtime = ID[0:2] + ":" + ID[2:]  # ID appears to be HHMM -> HH:MM
            f2.write(tedtime + ' ' + date + ',' + str(sample))
        elif line.strip():
            if not ID:
                continue  # data line before any '#' header: skip it
            try:
                words = line.split()
                value = float(words[-1])  # last word is the numeric value
                unit = words[-2].lstrip('(').rstrip(')')
                key = ' '.join(words[:-2])
                data[ID][key] = {'value': value, 'unit': unit}
            except ValueError:
                print("Could not parse this line:")
                print(line)
                continue
        else:  # 'empty' line closes the current record
            if ID:
                # NOTE(review): raises KeyError if the record had no
                # 'Contact Angle' data line — confirm that is impossible.
                ca_str = str(data[ID]['Contact Angle']['value'])
                f2.write(',' + ca_str + '\n')
            ID = None
    # BUG FIX: the original wrote data[ID] here unconditionally, which
    # crashes with data[None] when the file ends with a blank line
    # (ID has just been reset to None by the else branch above).
    if ID is not None:
        ca_str2 = str(data[ID]['Contact Angle']['value'])
        f2.write(',' + ca_str2 + '\n')
At the minute, I'm manually adding the filename (infile) and the output filename (outfile). I would like the output file name to be the same as the input file, preceded by an 'o', as shown in the example code.

You can use glob to get all the files in the directory:
from glob import glob

# Process every .txt file in the current directory; the output file gets the
# same name prefixed with 'o'.
files = glob('*.txt')
for filename in files:
    with open(filename, 'r') as f, open('o' + filename, 'w') as f1:
        # read from f, write the (transformed) result to f1
        for line in f:
            f1.write(line)
Simply iterate over each filename, do what you want to it, and write it to a new file.
Make sure your script is run from the directory you are in, or you need to pass the path to glob.

import glob
import os.path


def text_files(target_dir):
    """Yield (infile, outfile) tuples for all *.txt files in target_dir.

    The output name is the input's base name prefixed with 'o', placed in
    the same directory.
    """
    # Local renamed: the original shadowed the function's own name.
    pattern = os.path.join(target_dir, '*.txt')
    for fname in glob.glob(pattern):
        outfile = 'o' + os.path.basename(fname)
        outfile = os.path.join(target_dir, outfile)
        yield fname, outfile


# Search for text files in /tmp
for inf, outf in text_files("/tmp"):
    print(inf, outf)  # BUG FIX: Python-2 print statement -> print() function

Related

Python function within a loop iterating over text files only works on the first file

I'm writing a simple script which loops over some text file and uses a function which should replace some string looking in a .csv file (every row has the word to replace and the word which I want there)
Here is my simple code:
import os
import re
import csv
def substitute_tips(table, tree_content):
    """Replace names in tree_content using a 2-column table.

    For each row of *table*, searches tree_content for the pattern in
    row[1]; on a match, every occurrence of the matched text is replaced
    by row[0].  Prints progress and returns the modified string.

    NOTE(review): row[1] is used as a regular expression, so regex
    metacharacters in the table will not match literally.
    """
    count = 0
    for row in table:
        print("element of the table", row[1])
        match = re.search(row[1], tree_content)
        if match is not None:
            # str.replace substitutes *all* occurrences of the matched text.
            tree_content = tree_content.replace(match.group(), row[0])
            count = count + 1
        else:
            print("Not found: ", row[1])
            # BUG FIX: dropped the pointless `tree_content = tree_content`.
    print("Substitutions done: ", count)
    return tree_content
path = os.getcwd()
table_name = "162_table.csv"
# BUG FIX: read the whole table into a list once.  csv.reader is a one-shot
# iterator, so re-using it for the second tree file yields no rows — this is
# why the original only worked on the first file.
with open(table_name) as table:
    rows = list(csv.reader(table, delimiter='\t'))
for root, dirs, files in os.walk(path, topdown=True):
    for name in files:
        if name.endswith(".tree"):
            # NOTE(review): Fore.* was undefined (colorama never imported),
            # so the original raised NameError; plain print instead.
            print("Working on treefile", name)
            # BUG FIX: os.walk yields bare names — join with root.
            with open(os.path.join(root, name), "r") as my_tree:
                my_tree_content = my_tree.read()
            output_tree = substitute_tips(rows, my_tree_content)
            # NOTE(review): rstrip("tree") strips *characters*, not the
            # suffix; it works here only because the '.' stops it.
            out_path = os.path.join(root, name.rstrip("tree") + "SPECIES_NAME.tre")
            with open(out_path, "w") as output_file:
                output_file.write(output_tree)
        else:
            print(name, "doesn't end in .tree")
It's probably very easy, but I'm a newbie.
Thanks!
The files list returned by os.walk contains only the file names rather than the full path names. You should join root with the file names instead to be able to open them:
Change:
my_tree = open(name, "r")
...
output_file = open(name.rstrip("tree") + "SPECIES_NAME.tre", "w")
to:
my_tree = open(os.path.join(root, name), "r")
...
output_file = open(os.path.join(root, name.rstrip("tree") + "SPECIES_NAME.tre"), "w")

Write to file line by line Python

Here, I want to write the word_count in each loop line by line to the file. However, they are written all back to back.
import os
import string
def remove_punctuation(value):
    """Return *value* with ASCII and Arabic punctuation characters removed."""
    # Extra characters beyond string.punctuation: Arabic comma, question
    # mark, semicolon, and guillemets.
    # BUG FIX: the original had `c ! = '؛'` (space inside !=), a SyntaxError.
    extra = {'،', '؟', '؛', '«', '»'}
    # join() builds the result in one pass; repeated += is quadratic.
    return "".join(c for c in value
                   if c not in string.punctuation and c not in extra)
def all_words(file_path):
    """Return the word count of *file_path* as a string ending in a newline.

    Reads the file as UTF-8, strips punctuation, and counts whitespace
    separated words.  BUG FIX: the trailing '\n' makes each count land on
    its own line when the caller writes the return values back-to-back
    (the original wrote them all on one line).
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        text = f.read()
    text = remove_punctuation(text)
    words = text.split()
    return str(len(words)) + '\n'
# Walk C:/ZebRa bottom-up and record one word count per directory
# (the break stops after the first file of each directory).
with open('D:/t.txt', 'w') as myfile:
    for root, dirs, files in os.walk("C:/ZebRa", topdown=False):
        for filename in files:
            count = all_words(os.path.join(root, filename))
            myfile.write(count)
            break  # only the first file in each directory
I have also tried to add newline, but instead, it writes nothing.
myfile.write(f'\n')
Change this line:
return str(word_count)
to
return str(word_count) + '\n'
If you're using python 3.6+, you could also try:
return f'{word_count}\n'
You can write a newline character at the end of each iteration:
for root, dirs, files in os.walk("C:/ZebRa", topdown= False):
for filename in files:
file_path = os.path.join(root, filename)
f = all_words(file_path)
myfile.write(f)
break
myfile.write('\n')
When you use file.write(), try this instead:
myfile.write(f+"\n")
This will add a new line after every iteration
For your code to work, however, you need to iterate in a for loop, like this:
for string in f:
file.write(string+"\n")
I hope this helps

Compare multiple text files, and save commons values

My actual code :
import os, os.path

DIR_DAT = "dat"
DIR_OUTPUT = "output"
filenames = []

# Create the output folder in case it doesn't exist.
if not os.path.exists(DIR_OUTPUT):
    os.makedirs(DIR_OUTPUT)

# Isolate empty values (lines whose '=' right-hand side is blank) from the
# different contracts.
for root, dirs, files in os.walk(DIR_DAT):  # renamed: `dir` shadowed a builtin
    for filename in files:
        filename_output = os.path.join(DIR_OUTPUT,
                                       os.path.splitext(filename)[0] + ".txt")
        filenames.append(filename_output)
        # BUG FIX: join with the walk's current root, not DIR_DAT — files in
        # sub-directories of dat/ could not be opened before.
        filename_input = os.path.join(root, filename)
        with open(filename_input) as infile, open(filename_output, "w") as outfile:
            for line in infile:
                if not line.strip().split("=")[-1]:
                    outfile.write(line)

# Create a single file from all contracts (the collected empty values).
with open(os.path.join(DIR_OUTPUT, "all_agreements.txt"), "w") as outfile:
    for fname in filenames:
        with open(fname) as infile:
            for line in infile:
                outfile.write(line)

# Final file with the common empty data.
# NOTE(review): a line occurring k times is written k-1 times, so this does
# not by itself keep only lines common to all four files — see the
# set.intersection approach for that.
with open(os.path.join(DIR_OUTPUT, "all_agreements.txt")) as infile, \
        open(os.path.join(DIR_OUTPUT, "results.txt"), "w") as outfile:
    seen = set()
    for line in infile:
        line_lower = line.lower()
        if line_lower in seen:
            outfile.write(line)
        else:
            seen.add(line_lower)

# BUG FIX: corrected the typo "ouptut" in the user-facing message.
print("Psst go check in the output folder ;)")
The last lines of my code check whether or not an element exists multiple times. The element may exist once, twice, three, or four times; whenever it is seen again, it is added to results.txt.
But the thing is that I want to save it into results.txt only if it exists 4 times in results.txt.
Or best scenario, compare the 4 .txt files and save elements in commons into results.txt.
But I can't solve it..
Thanks for the help :)
To make it easier,
# Write each line that was already encountered earlier (i.e. duplicates only).
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
    seen = set()
    for line in infile:
        if line not in seen:
            seen.add(line)
        else:
            outfile.write(line)
Where can I use the .count() function ?
Because I want to do something like xxx.count(line) == 4 then save it into resulsts.txt
If your files are not super big you can use set.intersection(a,b,c,d).
# Build one set of lines per input file; their intersection is exactly the
# lines shared by every file.
data = []
for fname in filenames:
    with open(fname) as infile:
        data.append({line for line in infile})
results = set.intersection(*data)
You also don't need to create one single big file for this issue.
Not sure what your input looks like or what output is expected...
But maybe this can spark some ideas:
from io import StringIO
from collections import Counter

# Four small in-memory "files" of key=value lines.
lines = ["""\
a=This
b=is
c=a Test
""", """\
a=This
b=is
c=a Demonstration
""", """\
a=This
b=is
c=another
d=example
""", """\
a=This
b=is
c=so much
d=fun
"""]

files = (StringIO(l) for l in lines)

# Count how often each exact line occurs across all the files.
C = Counter()
for f in files:
    C.update(f)  # Counter.update over a file object counts its lines

# Lines present in all four files.
print([k for k, v in C.items() if v >= 4])
# Output: ['a=This\n', 'b=is\n']

Python 3.4 - Add file name + line number to all files in a folder

I'm not a programmer and I've been doing my best to create some small scripts with Python 3.4 that help me with different tasks at work.
I have several .txt files, and to every line in each file I would need to append:
the file name
the file name+ line number
save it as a UTF-8 csv with all fields separated by commas.
I managed to do this for one particular file, but I'm struggling to do it for all the files in the folder. I've tried import glob but with no success.
This is the code right now (a mess... that partially works):
# First pass: rewrite Ruth.txt in place, prefixing each line with its
# 1-based number followed by '","'.
# NOTE(review): the quoting looks unbalanced (produces e.g. 1","line);
# presumably a '"%d","%s"' CSV row was intended — confirm with the author.
with open('Ruth.txt', 'r') as program:
    data = program.readlines()
with open('Ruth.txt', 'w') as program:
    for (number, line) in enumerate(data):
        program.write('%d","%s' % (number + 1, line))

# Second pass: build "filename","filename<line>" rows from the file just
# rewritten above (so the lines already carry the numbering).
files = 'Ruth.txt'  # space-separated list of file names
all_lines = []
for f in files.split():
    lines = open(f, 'r').readlines()  # NOTE(review): handle never closed
    for line in lines:
        all_lines.append('"' + f + '"' + ',' + '"' + f + line.strip() + '"')
fout = open(f + 'out.csv', 'w')
fout.write('\n'.join(all_lines))
fout.close()
Try this:
import os
def add_numbers(filename):
    """Rewrite *filename* in place, prefixing every line with "N. "."""
    with open(filename, 'r') as src:
        contents = src.readlines()
    with open(filename, 'w') as dst:
        for num, text in enumerate(contents, start=1):
            dst.write('%d. %s' % (num, text))
# Number every file under `folder`, recursing into all sub-directories.
# NOTE(review): `folder` must be defined before this runs.
for path, _, filenames in os.walk(folder):
    for filename in filenames:
        add_numbers(os.path.join(path, filename))
This will add numbers to each file in the directory and each file in all sub-directories. If you don't want it to check all sub-directories, change the for loop to this:
# Non-recursive variant: next() takes only the first (top-level) walk result.
path, _, filenames = next(os.walk(folder))
for filename in filenames:
    add_numbers(os.path.join(path, filename))
Here is the complete script; it takes one positional argument (folder) and creates a new .csv file at the same level as each file.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
def get_files(folder_path, suffix=".txt"):
    """Recursively yield every file under *folder_path* ending in *suffix*."""
    return Path(folder_path).glob("**/*" + suffix)
def write_lines(file_):
    """Write a CSV twin of *file_*: one "<lineno>.,<name>,<line>" row per line.

    Reads file_ (a pathlib.Path) as UTF-8 and writes <stem>.csv next to it.
    BUG FIX: the output is now opened with encoding="utf-8" too — the
    original used the platform default, which breaks on non-ASCII text on
    Windows despite the script's stated goal of UTF-8 CSV output.
    """
    with file_.with_suffix(".csv").open("w", encoding="utf-8") as fout, \
            file_.open(encoding="utf-8") as fin:
        for i, line in enumerate(fin, 1):
            # line number, file name, original line (line keeps its '\n')
            new_line = ",".join(["%d." % i, file_.name, line])
            fout.write(new_line)
def main(folder):
    """Convert every matching file under *folder*, printing each path."""
    for found in get_files(folder):
        print(found)
        write_lines(found)
if __name__ == '__main__':
    try:
        # sys.argv[1] raises IndexError when no folder argument was given.
        main(sys.argv[1])
    except IndexError:
        # NOTE(review): an IndexError raised *inside* main() would also be
        # caught here and misreported as a usage error.
        print("usage: %s foldername" % sys.argv[0])
This will take all text files in current folder and turn them into utf-8 encoded 'csv-style' files so that space in the text is turned into a comma with filename and line number also comma-separated.
from glob import glob

# Turn every *.txt in the current folder into a UTF-8 'csv-style' file:
# spaces become commas, and each row ends with the file name and line number.
filenames = glob("*.txt")
for fn in filenames:
    # BUG FIX: reset per file — the original initialised `text` once, so
    # every output CSV also contained the content of all earlier files.
    text = ''
    with open(fn, 'r') as f:
        for i, line in enumerate(f):
            line = ','.join(line.split())
            # BUG FIX: str(i + 1) — str.join raises TypeError on ints.
            text += ','.join((line, fn, str(i + 1))) + '\n'
    fnew = fn.rsplit('.', 1)[0] + '.csv'
    with open(fnew, 'w', encoding='utf-8') as f:
        f.write(text)

Python: Issue when trying to read and write multiple files

This script reads and writes all the individual html files in a directory. The script reiterates, highlight and write the output.The issue is, after highlighting the last instance of the search item, the script removes all the remaining contents after the last search instance in the output of each file. Any help here is appreciated.
import os
import sys
import re
source = raw_input("Enter the source files path:")
listfiles = os.listdir(source)
for f in listfiles:
filepath = os.path.join(source+'\\'+f)
infile = open(filepath, 'r+')
source_content = infile.read()
color = ('red')
regex = re.compile(r"(\b in \b)|(\b be \b)|(\b by \b)|(\b user \b)|(\bmay\b)|(\bmight\b)|(\bwill\b)|(\b's\b)|(\bdon't\b)|(\bdoesn't\b)|(\bwon't\b)|(\bsupport\b)|(\bcan't\b)|(\bkill\b)|(\betc\b)|(\b NA \b)|(\bfollow\b)|(\bhang\b)|(\bbelow\b)", re.I)
i = 0; output = ""
for m in regex.finditer(source_content):
output += "".join([source_content[i:m.start()],
"<strong><span style='color:%s'>" % color[0:],
source_content[m.start():m.end()],
"</span></strong>"])
i = m.end()
outfile = open(filepath, 'w')
outfile.seek(0, 2)
outfile.write(output)
print "\nProcess Completed!\n"
infile.close()
outfile.close()
raw_input()
After your for loop is over, you need to include whatever is left after the last match:
...
i = m.end()
# BUG FIX: removed the stray ')' that made this line a SyntaxError.
output += source_content[i:]  # Here's the end of your file
outfile = open(filepath, 'w')
...

Categories