Compare multiple text files, and save commons values

Compare multiple text files, and save commons values - python

My actual code :
import os, os.path
DIR_DAT = "dat"
DIR_OUTPUT = "output"
filenames = []
#in case if output folder doesn't exist
if not os.path.exists(DIR_OUTPUT):
os.makedirs(DIR_OUTPUT)
#isolating empty values from differents contracts
for roots, dir, files in os.walk(DIR_DAT):
for filename in files:
filenames.append("output/" + os.path.splitext(filename)[0] + ".txt")
filename_input = DIR_DAT + "/" + filename
filename_output = DIR_OUTPUT + "/" + os.path.splitext(filename)[0] + ".txt"
with open(filename_input) as infile, open(filename_output, "w") as outfile:
for line in infile:
if not line.strip().split("=")[-1]:
outfile.write(line)
#creating a single file from all contracts, nb the values are those that are actually empty
with open(DIR_OUTPUT + "/all_agreements.txt", "w") as outfile:
for fname in filenames:
with open(fname) as infile:
for line in infile:
outfile.write(line)
#finale file with commons empty data
#creating a single file
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
seen = set()
for line in infile:
line_lower = line.lower()
if line_lower in seen:
outfile.write(line)
else:
seen.add(line_lower)
print("Psst go check in the ouptut folder ;)")
The last lines of my code are checking wether or not, element exists mutliple times. So, may the element exists, once, twice, three, four times. It will add it to results.txt.
But the thing is that I want to save it into results.txt only if it exists 4 times in results.txt.
Or best scenario, compare the 4 .txt files and save elements in commons into results.txt.
But I can't solve it..
Thanks for the help :)
To make it easier,
with open(DIR_OUTPUT + "/all_agreements.txt") as infile, open(DIR_OUTPUT + "/results.txt", "w") as outfile:
seen = set()
for line in infile:
if line in seen:
outfile.write(line)
else:
seen.add(line)
Where can I use the .count() function ?
Because I want to do something like xxx.count(line) == 4 then save it into resulsts.txt

If your files are not super big you can use set.intersection(a,b,c,d).
data = []
for fname in filenames:
current = set()
with open(fname) as infile:
for line in infile:
current.add(line)
data.append(current)
results = set.intersection(*data)
You also don't need to create one single big file for this issue.

Not sure how your input looks like or what output is expected...
But maybe this can spark some ideas:
from io import StringIO
from collections import Counter
lines = ["""\
a=This
b=is
c=a Test
""", """\
a=This
b=is
c=a Demonstration
""", """\
a=This
b=is
c=another
d=example
""", """\
a=This
b=is
c=so much
d=fun
"""]
files = (StringIO(l) for l in lines)
C = Counter(line for f in files for line in f)
print([k for k,v in C.items() if v >= 4])
# Output: ['a=This\n', 'b=is\n']

Related

How to loop through each file in a folder, do some action to the file and save output to a file in another folder Python

I have a folder with multiple files like so:
1980
1981
1982
In each of these files is some text. I want to loop through each of these files and do some operation to each file then save the edited file to another folder and move onto the next file and so on. The result would be that I have the original folder and then another folder with the edited version of each file in it like so:
1980_filtered
1981_filtered
1982_filtered
Is it possible to do this?
Currently I have some code that loops through the files in a folder, does some filtering to each file and then saves all the edits of each file into one massive file. Here is my code:
import os
input_location = 'C:/Users/User/Desktop/mini_mouse'
output_location = 'C:/Users/User/Desktop/filter_mini_mouse/mouse'
for root, dir, files in os.walk(input_location):
for file in files:
os.chdir(input_location)
with open(file, 'r') as f, open('NLTK-stop-word-list', 'r') as f2:
mouse_file = f.read().split() # reads file and splits it into a list
stopwords = f2.read().split()
x = (' '.join(i for i in mouse_file if i.lower() not in (x.lower() for x in stopwords)))
with open(output_location, 'a') as output_file:
output_file.write(x)
Any help would be greatly appreciated!

You need to specify what each new file is called. To do so, Python has some good string formatting methods. Fortunately, your new desired file names are easy to do in a loop
import os
input_location = 'C:/Users/User/Desktop/mini_mouse'
output_location = 'C:/Users/User/Desktop/filter_mini_mouse/mouse'
for root, dir, files in os.walk(input_location):
for file in files:
new_file = "{}_filtered.txt".format(file)
os.chdir(input_location)
with open(file, 'r') as f, open('NLTK-stop-word-list', 'r') as f2:
mouse_file = f.read().split()
stopwords = f2.read().split()
x = (' '.join(i for i in mouse_file if i.lower() not in (x.lower() for x in stopwords)))
with open(output_location+'/'+new_file, 'w') as output_file: # Changed 'append' to 'write'
output_file.write(x)
If you're in Python 3.7, you can do
new_file = f"{file}_filtered.txt"
and
with open(f"{output_location}/{new_file}", 'w') as output_file:
output_file.write(x)

First of all you should start by opening the NLTK-stop-word-list only once, so I moved it outside of your loops. Second, os.chdir() is redundant, you can use os.path.join() to get your current file path (and to construct your new file path):
import os
input_location = 'C:/Users/User/Desktop/mini_mouse'
output_location = 'C:/Users/User/Desktop/filter_mini_mouse/'
stop_words_path = 'C:/Users/User/Desktop/NLTK-stop-word-list.txt'
with open(stop_words_path, 'r') as stop_words:
for root, dirs, files in os.walk(input_location):
for name in files:
file_path = os.path.join(root, name)
with open(file_path, 'r') as f:
mouse_file = f.read().split() # reads file and splits it into a list
stopwords = stop_words.read().split()
x = (' '.join(i for i in mouse_file if i.lower() not in (x.lower() for x in stopwords)))
new_file_path = os.path.join(output_location, name) + '_filtered'
with open(new_file_path, 'a') as output_file:
output_file.write(x)
P.S: I took the liberty to change some of your variable names as they were part of python's built in words ('file' and 'dir'). If you'll run __builtins__.__dict__.keys() you'll see them there.

Writting several files according to an element of an original file

I need to read a file in bed format that contains coordinates of all chr in a genome, into different files according with the chr. I tried this approach but it doesn't work, it doesn't create any files. Any idees why this happens or alternative approaches to solve this problem?
import sys
def make_out_file(dir_path, chr_name, extension):
file_name = dir_path + "/" + chr_name + extension
out_file = open(file_name, "w")
out_file.close()
return file_name
def append_output_file(line, out_file):
with open(out_file, "a") as f:
f.write(line)
f.close()
in_name = sys.argv[1]
dir_path = sys.argv[2]
with open(in_name, "r") as in_file:
file_content = in_file.readlines()
chr_dict = {}
out_file_dict = {}
line_count = 0
for line in file_content[:0]:
line_count += 1
elems = line.split("\t")
chr_name = elems[0]
chr_dict[chr_name] += 1
if chr_dict.get(chr_name) = 1:
out_file = make_out_file(dir_path, chr_name, ".bed")
out_file_dict[chr_name] = out_file
append_output_file(line, out_file)
elif chr_dict.get(chr_name) > 1:
out_file = out_file_dict.get(chr_name)
append_output_file(line, out_file)
else:
print "There's been an Error"
in_file.close()

This line:
for line in file_content[:0]:
says to iterate over an empty list. The empty list comes from the slice [:0] which says to slice from the beginning of the list to just before the first element. Here's a demonstration:
>>> l = ['line 1\n', 'line 2\n', 'line 3\n']
>>> l[:0]
[]
>>> l[:1]
['line 1\n']
Because the list is empty no iteration takes place, so the code in the body of your for loop in not executed.
To iterate over each line of the file you do not need the slice:
for line in file_content:
However, it is better again to iterate over the file object as this does not require that the whole file be first read into memory:
with open(in_name, "r") as in_file:
chr_dict = {}
out_file_dict = {}
line_count = 0
for line in in_file:
...
Following that there are numerous problems, including syntax errors, with the code in the for loop which you can begin debugging.

Python 3.4 - Add file name + line number to all files in a folder

I'm not a programmer and I've been doing my best to create some small scripts with Python 3.4 that help me with different tasks at work.
I have several .txt files and I to every line in the file I would need to append:
the file name
the file name+ line number
save it as a UTF-8 csv with all fields separated by commas.
I managed to do this for one particular file, but I'm struggling to do it for all the files in the folder. I've tried import glob but with no success.
This is the code right now (a mess... that partially works):
with open('Ruth.txt', 'r') as program:
data = program.readlines()
with open('Ruth.txt', 'w') as program:
for (number, line) in enumerate(data):
program.write('%d","%s' % (number + 1, line))
files = 'Ruth.txt'
all_lines = []
for f in files.split():
lines = open(f, 'r').readlines()
for line in lines:
all_lines.append('"' + f + '"' + ',' + '"' + f + line.strip() + '"')
fout = open(f + 'out.csv', 'w')
fout.write('\n'.join(all_lines))
fout.close()

Try this:
import os
def add_numbers(filename):
with open(filename, 'r') as readfile:
data = readfile.readlines()
with open(filename, 'w') as writefile:
for i, line in enumerate(data):
writefile.write('%d. %s' % (i + 1, line))
for path, _, filenames in os.walk(folder):
for filename in filenames:
add_numbers(os.path.join(path, filename))
This will add numbers to each file in the directory and each file in all sub-directories. If you don't want it to check all sub-directories, change the for loop to this:
path, _, filenames = next(os.walk(folder))
for filename in filenames:
add_numbers(os.path.join(path, filename))

here the complete script that take one positional argument (folder) and create a new .csv file at the same level than the file.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from pathlib import Path
def get_files(folder_path, suffix=".txt"):
return Path(folder_path).glob("**/*%s" % suffix)
def write_lines(file_):
with file_.with_suffix(".csv").open("w") as fout, file_.open(encoding="utf-8") as fin:
for i, line in enumerate(fin, 1):
# line number, file name, line
new_line = ",".join(["%d." % i, file_.name, line])
fout.write(new_line)
def main(folder):
for file_ in get_files(folder):
print(file_)
write_lines(file_)
if __name__ == '__main__':
try:
main(sys.argv[1])
except IndexError:
print("usage: %s foldername" % sys.argv[0])

This will take all text files in current folder and turn them into utf-8 encoded 'csv-style' files so that space in the text is turned into a comma with filename and line number also comma-separated.
from glob import glob
filenames = glob("*.txt")
text = ''
for fn in filenames:
with open(fn,'r') as f:
for i,line in enumerate(f):
line=','.join(line.split())
text += ','.join((line,fn,i+1)) + '\n'
fnew = fn.rsplit('.',1)[0]+'.csv'
with open(fnew,'w', encoding='utf-8') as f:
f.write(text)

How to split text file by id in python

I have a bunch of text files containing tab separated tables. The second column contains an id number, and each file is already sorted by that id number. I want to separate each file into multiple files by the id number in column 2. Here's what I have.
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(readpath+filename, 'r') as fh:
lines = fh.readlines()
lastid = 0
f = open(writepath+'checkme.txt', 'w')
f.write(filename)
for line in lines:
thisid = line.split("\t")[1]
if int(thisid) <> lastid:
f.close()
f = open(writepath+thisid+'-'+filename,'w')
lastid = int(thisid)
f.write(line)
f.close()
What I get is simply a copy of all the read files with the first id number from each file in front of the new filenames. It is as if
thisid = line.split("\t")[1]
is only done once in the loop. Any clue to what is going on?
EDIT
The problem was my files used \r rather than \r\n to terminate lines. Corrected code (simply adding 'rU' when opening the read file and swapping != for <>):
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(readpath+filename, 'rU') as fh:
lines = fh.readlines()
lastid = 0
f = open(writepath+'checkme.txt', 'w')
f.write(filename)
for line in lines:
thisid = line.split("\t")[1]
if int(thisid) != lastid:
f.close()
f = open(writepath+thisid+'-'+filename,'w')
lastid = int(thisid)
f.write(line)
f.close()

If you're dealing with tab delimited files, then you can use the csv module, and take advantage of the fact that itertools.groupby will do the previous/current tracking of the id for you. Also utilise os.path.join to make sure your filenames end up joining correctly.
Untested:
import os
import csv
from itertools import groupby
readpath = 'path-to-read-file'
writepath = 'path-to-write-file'
for filename in os.listdir(readpath):
with open(os.path.join(readpath, filename)) as fin:
tabin = csv.reader(fin, delimiter='\t')
for file_id, rows in groupby(tabin, lambda L: L[1]):
with open(os.path.join(writepath, file_id + '-' + filename), 'w') as fout:
tabout = csv.writer(fout, delimiter='\t')
tabout.writerows(rows)

Adding entries from multiple files in python

I have a question on how to add entries from 100 files (each file contains two columns) and then writing them to a new file(which will also contain two columns)?

This is very underspecified. It's not clear what your problem is.
Probabably you'd do something like:
entries = []
for f in ["file1.txt", "file2.txt", ..., "file100.txt"]:
entries.append(open(f).readlines())
o = open("output.txt", "w")
o.writelines(entries)
o.close()

Wasn't sure if you needed a solution to find all those 100 files as well?
If so, here is one approach including reading them all and writing them to a joined file:
from os import walk
from os.path import abspath
lines = []
for root, folders, files in walk('./path/'):
for file in files:
fh = open(abspath(root + '/' + file), 'rb')
lines.append(fh.read())
fh.close()
# break if you only want the first level of your directory tree
o = open('output.txt', 'wb')
o.write('\n'.join(lines))
o.close()
You could also do a "memory efficient" solution:
from os import walk
from os.path import abspath
o = open('output.txt', 'wb')
for root, folders, files in walk('./path/'):
for file in files:
fh = open(abspath(root + '/' + file), 'rb')
for line in fh.readline():
o.write(line)
del line
fh.close()
del fh
# break if you only want the first level of your directory tree
o.close()
Much of this is automated (I think) within Python, but lazy or not, if you can then remove objects from the memory after closing the files and before and before reusing variable names.. just in case?

a more scalable way, inspired by Torxed approach
from os import walk
from os.path import abspath
with open('output.txt', 'wb') as o:
for root, folders, files in walk('./path/'):
for filename in files:
with open(abspath(root + '/' + filename), 'rb') as i:
for line in i:
o.write(line)

Do you want to chain them? I.e., do you want all lines of file 1, then all lines of file 2, ...
Or do you want to merge them? Line 1 of file 1, line 1 of file 2, ...
For the first case:
from itertools import chain
filenames = ...
file_handles = [open(fn) for fn in filenames]
with open("output.txt", "w") as out_fh:
for line in chain(file_handles):
out_fh.write(line)
for fh in file_handles:
fh.close()
For the second case:
from itertools import izip_longest
filenames = ...
file_handles = [open(fn) for fn in filenames]
with open("output.txt", "w") as out_fh:
for lines in izip_longest(*file_handles, fillvalue=None):
for line in lines:
if line is not None:
out_fh.write(line)
for fh in file_handles:
fh.close()
Important: Never forget to close your files!
As #isedev pointed out, this approach is o.k. for 100 files, but as I open all handles immediately, for thousands this won't work.
If you want to overcome this problem, only option 1 (chaining) is reasonable...
filenames = ...
with open("output.txt", "w") as out_fh:
for fn in filenames:
with open(fn) as fh:
for line in fh:
out_fh.write(line)

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Compare multiple text files, and save commons values - python

Related

How to loop through each file in a folder, do some action to the file and save output to a file in another folder Python

Writting several files according to an element of an original file

Python 3.4 - Add file name + line number to all files in a folder

How to split text file by id in python

Adding entries from multiple files in python

Categories

Resources