Compare two different files line by line in Python

I have two different files and I want to compare their contents line by line, and write their common contents to a different file. Note that both of them contain some blank lines.
Here is my pseudo code:
file1 = open('some_file_1.txt', 'r')
file2 = open('some_file_2.txt', 'r')
FO = open('some_output_file.txt', 'w')

for line1 in file1:
    for line2 in file2:
        if line1 == line2:
            FO.write("%s\n" % (line1))

FO.close()
file1.close()
file2.close()
However, by doing this I get lots of blank lines in my FO file; it seems the common blank lines are also written. I want to write only the text part. Can somebody please help me?
For example, my first file (file1) contains:
Config:
Hostname = TUVALU
BT:
TS_Ball_Update_Threshold = 0.2
BT:
TS_Player_Search_Radius = 4
BT:
Ball_Template_Update = 0
while the second file (file2) contains:
Pole_ID = 2
Width = 1280
Height = 1024
Color_Mode = 0
Sensor_Scale = 1
Tracking_ROI_Size = 4
Ball_Template_Update = 0
If you notice, the last line of each file is the same ("Ball_Template_Update = 0"), hence I want to write this line to my FO file. But the problem with my approach is that it writes the common blank lines as well. Should I use regex for this problem? I do not have experience with regex.

This solution reads both files in one pass, excludes blank lines, and writes the common lines regardless of their position in the file:
with open('some_file_1.txt', 'r') as file1:
    with open('some_file_2.txt', 'r') as file2:
        same = set(file1).intersection(file2)

same.discard('\n')

with open('some_output_file.txt', 'w') as file_out:
    for line in same:
        file_out.write(line)

Yet another example...
from __future__ import print_function  # Only needed on Python 2

with open('file1.txt') as f1, open('file2.txt') as f2, open('outfile.txt', 'w') as outfile:
    for line1, line2 in zip(f1, f2):
        if line1 == line2:
            print(line1, end='', file=outfile)
And if you want to eliminate common blank lines, just change the if statement to:
if line1.strip() and line1 == line2:
.strip() removes all leading and trailing whitespace, so if that's all that's on a line, it will become an empty string "", which is considered false.
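For example, a blank line strips down to an empty string, which is falsy:
>>> "   \n".strip()
''
>>> bool("   \n".strip())
False
>>> bool("text\n".strip())
True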

If you are specifically looking to get the difference between two files, this might help:
with open('first_file', 'r') as file1:
    with open('second_file', 'r') as file2:
        difference = set(file1).difference(file2)

difference.discard('\n')

with open('diff.txt', 'w') as file_out:
    for line in difference:
        file_out.write(line)

If order is preserved between the files, you might also prefer difflib. Although Robᵩ's answer is the bona fide standard for intersections, you might actually be looking for a rough diff-like comparison:
from difflib import Differ

with open('cfg1.txt') as f1, open('cfg2.txt') as f2:
    differ = Differ()
    for line in differ.compare(f1.readlines(), f2.readlines()):
        if line.startswith(" "):
            print(line[2:], end="")
That said, this behaves differently from what you asked for (order matters), even though in this instance the same output is produced.

Once the file object is iterated, it is exhausted.
>>> f = open('1.txt', 'w')
>>> f.write('1\n2\n3\n')
>>> f.close()
>>> f = open('1.txt', 'r')
>>> for line in f: print line
...
1
2
3
# exhausted, another iteration does not produce anything.
>>> for line in f: print line
...
>>>
Use file.seek (or close/open the file) to rewind the file:
>>> f.seek(0)
>>> for line in f: print line
...
1
2
3

Try this:
from __future__ import with_statement  # Only needed on Python 2.5

filename1 = "G:\\test1.TXT"
filename2 = "G:\\test2.TXT"

with open(filename1) as f1:
    with open(filename2) as f2:
        file1list = f1.read().splitlines()
        file2list = f2.read().splitlines()
        list1length = len(file1list)
        list2length = len(file2list)
        if list1length == list2length:
            for index in range(len(file1list)):
                if file1list[index] == file2list[index]:
                    print file1list[index] + " == " + file2list[index]
                else:
                    print file1list[index] + " != " + file2list[index] + "  Not equal"
        else:
            print "Difference in the size of the files and number of lines"

I have just been faced with the same challenge, but I thought "why program this in Python if you can solve it with a simple grep?", which led to the following Python code:
import subprocess
from subprocess import PIPE

try:
    output1, errors1 = subprocess.Popen(["c:\\cygwin\\bin\\grep", "-Fvf", "c:\\file1.txt", "c:\\file2.txt"], shell=True, stdout=PIPE, stderr=PIPE).communicate()
    output2, errors2 = subprocess.Popen(["c:\\cygwin\\bin\\grep", "-Fvf", "c:\\file2.txt", "c:\\file1.txt"], shell=True, stdout=PIPE, stderr=PIPE).communicate()
    if len(output1) + len(output2) + len(errors1) + len(errors2) > 0:
        print("Compare result : There are differences:")
        if len(output1) + len(output2) > 0:
            print("  Output differences : ")
            print(output1)
            print(output2)
        if len(errors1) + len(errors2) > 0:
            print("  Errors : ")
            print(errors1)
            print(errors2)
    else:
        print("Compare result : Both files are equal")
except Exception as ex:
    print("Compare result : Exception during comparison")
    print(ex)
    raise
The trick behind this is the following:
grep -Fvf file1.txt file2.txt prints the lines of file2.txt that do not appear in file1.txt, so an empty result means every entry in file2.txt is present in file1.txt. By doing this in both directions we can see whether the contents of both files are "equal". I put "equal" in quotes because duplicate lines are disregarded in this way of working.
Obviously, this is just an example: you can replace grep by any commandline file comparison tool.
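For comparison, the same two-direction check can be sketched in pure Python with sets. This is not an exact equivalent (grep -F matches substrings unless -x is given, while this version compares whole lines), and the file paths are just the example paths from above:

with open('c:\\file1.txt') as f1, open('c:\\file2.txt') as f2:
    lines1, lines2 = set(f1), set(f2)

# Like the grep approach, duplicate lines are disregarded (set semantics).
missing_from_1 = lines2 - lines1  # roughly grep -Fvf file1.txt file2.txt
missing_from_2 = lines1 - lines2  # roughly grep -Fvf file2.txt file1.txt

if missing_from_1 or missing_from_2:
    print("Compare result : There are differences")
else:
    print("Compare result : Both files are equal")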

difflib is well worth the effort, with nice condensed output.
from pathlib import Path
import difflib

mypath = '/Users/x/lib/python3'
file17c = Path(mypath, 'oop17c.py')
file18c = Path(mypath, 'oop18c.py')

with open(file17c) as file_1:
    file1 = file_1.readlines()
with open(file18c) as file_2:
    file2 = file_2.readlines()

for line in difflib.unified_diff(
        file1, file2, fromfile=str(file17c), tofile=str(file18c), lineterm=''):
    print(line)
Output:
+ ... unique stuff present in file18c
- ... stuff absent in file18c but present in file17c

Related

Removing duplicate characters from a file

I have a file as below,
this is Rdaaaa
thissss Is Sethaaa
hiii
I want to remove all the duplicate characters from this file.
I tried two pieces of code.
The first completely removes duplicate chars, but it does not seem to be efficient:
with open("test.txt", "r") as f1:
with open("test1.txt", "w") as f2:
#content = f1.readlines()
char_set = set()
while True:
char = f1.read(1)
if char not in char_set:
char_set.add(char)
f2.write(char)
if not char:
break
print(char_set)
I also tried using regex, following a Stack Overflow post:
import re

with open("test.txt", "r") as f1:
    with open("test1.txt", "w") as f2:
        content = f1.read()
        f2.write(re.sub(r'([a-z])\1+', r'\1', content))
But this turns thiish into thish, not into this.
Any suggestions for code with improved efficiency?
For "medium" sized files that can fit into memory, this approach is a bit faster and fewer lines. You can load the whole file into memory, and then create a dictionary from it, where the dictionary's keys are the individual characters in the file. This keeps the output chars in the same order as when they were first seen (property of dict).
This ran in about 100ms for a 2 MB file with 11501 distinct characters. Your use case may make another approach better.
# replace in_file and out_file with actual paths or file names
with open(in_file, "r") as f1, open(out_file, "w") as f2:
    txt = f1.read()
    ordered_set = ''.join(dict.fromkeys(txt).keys())
    f2.write(ordered_set)
If you have a big file and you don't want to load it into memory, you can read it line by line instead of character by character, which is much better and faster:
file_input = open("old_file.txt", "r")
file_output = open("new_file.txt", "w")
memory = set()

while True:
    line = file_input.readline()
    if not line:
        break
    new_line = ""
    for char in line:
        if char == " ":  # always keep spaces
            new_line += char
            continue
        if char not in memory:
            memory.add(char)
            new_line += char
    file_output.write(new_line)

file_input.close()
file_output.close()
But if the file is small, you can read it once and apply the same logic.
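A minimal sketch of that whole-file variant (the file names are placeholders; same space-preserving logic as above):

with open("old_file.txt") as f_in, open("new_file.txt", "w") as f_out:
    seen = set()
    out_chars = []
    for char in f_in.read():
        if char == " ":            # keep every space, as above
            out_chars.append(char)
        elif char not in seen:     # keep only the first occurrence of other chars
            seen.add(char)
            out_chars.append(char)
    f_out.write("".join(out_chars))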
It looks like you want to delete letters that appear repeatedly in succession. Try using itertools.groupby:
>>> from itertools import groupby
>>> from operator import itemgetter
>>> s = '''this is Rdaaaa
... thissss Is Sethaaa
... hiii'''
>>> print(''.join(map(itemgetter(0), groupby(s))))
this is Rda
this Is Setha
hi
Try this:
import re

# read the file
with open('file.txt', 'r') as f:
    data = f.read()

# remove runs of repeated characters
data = re.sub(r'(.)\1+', r'\1', data)

# write the file
with open('file.txt', 'w') as f:
    f.write(data)
The output is:
this is Rda
this Is Setha
hi

Sorting an output file A-Z

1.) Take input from a file specified as the first argument on the command line. (working)
2.) Remove all lines starting with "#". (working)
3.) Sort the remaining lines A-Z. (not working)
4.) Write the output to a file named for the input file with the current time appended. (working)
How can I get point 3 working?
import sys
from datetime import *

arg = sys.argv[1]
out_file = str(arg) + "." + datetime.now().strftime("%H%M")

with open(sys.argv[1], 'r') as fin, open(out_file, 'w') as fout:
    for i, line in enumerate(fin):
        if i == 0 or not line.lstrip().startswith('#'):
            # line = sorted(out_file())
            fout.write(line)
It should work like this:
import sys
from datetime import *

arg = sys.argv[1]
out_file = str(arg) + "." + datetime.now().strftime("%H%M")
result = []

with open(sys.argv[1], 'r') as fin, open(out_file, 'w') as fout:
    for i, line in enumerate(fin):
        if i == 0 or not line.lstrip().startswith('#'):
            result.append(line)
    result = sorted(result)
    for line in result:
        fout.write(line)
What you did wrong in your example is that you tried to sort a single line instead of all the lines.
I suggest using either the method fin.readlines() or the expression list(fin), both of which return a list of all the lines in the file. Then you can use the sorted built-in function to sort this list (or do anything you want to filter it) before writing to the output file.
Here is an example of solution, very close to your original code:
import sys
from datetime import *

arg = sys.argv[1]
out_file = str(arg) + "." + datetime.now().strftime("%H%M")

with open(sys.argv[1], 'r') as fin, open(out_file, 'w') as fout:
    lines = sorted(list(fin))
    for i, line in enumerate(lines):
        if i == 0 or not line.lstrip().startswith('#'):
            fout.write(line)
I can also suggest this solution:
import sys
from datetime import *

arg = sys.argv[1]
out_file = str(arg) + "." + datetime.now().strftime("%H%M")

with open(sys.argv[1], 'r') as fin, open(out_file, 'w') as fout:
    lines = list(fin)           # Get the list of all lines
    fout.write(lines.pop(0))    # Write the 1st line (and remove it from the list)
    lines = sorted(lines)       # Sort A-Z
    lines = [s for s in lines if not s.lstrip().startswith('#')]  # Remove lines starting with '#'
    fout.write(''.join(lines))  # Write the remaining lines
This version of the code:
- writes the first line by advancing the fin iterator
- uses the filter built-in function to remove the lines starting with '#' [1] (the first line is not included in the filter, because the fin iterator has moved past it)
- sorts the filtered lines, to minimise the amount of sorting to be done [2]
import sys
from datetime import *

out_file = sys.argv[1] + "." + datetime.now().strftime("%H%M")

with open(sys.argv[1], 'r') as fin, open(out_file, 'w') as fout:
    fout.write(next(fin))
    filtered = filter(lambda x: not x.lstrip().startswith('#'), fin)
    lines = sorted(filtered)
    fout.writelines(lines)
[1] You could use a generator expression or a list comprehension here instead of filter.
[2] The code assumes that every line ends with a newline.
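For instance, the filter(...) call from footnote [1] could be written as a generator expression:

filtered = (x for x in fin if not x.lstrip().startswith('#'))  # instead of filter(...)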

Comparing two text files and extracting text in Python

I have two text files; one contains a list of ids with numbers, and the other contains a list of ids with text. I want to compare the two files, and for the lines having the same id, print the text inside parentheses. This is what I have so far:
import fileinput
import sys

def clean(file1):
    with open(sys.argv[1], 'r') as file1:  # file ppx
        for line in file1:
            words = line.split()
            id1 = words[-1]
    with open(sys.argv[2], 'r') as file2:  # file ids
        for line in file2:
            words2 = line.split()
            id2 = words2[0]
    for line in file1:
        if id1 == id2[0]:
            text = s[s.find("(")+1:s.find(")")]
            print text
The first file looks like this: http://pastebin.com/PCU6f7vz
The second file looks like this: http://pastebin.com/Y2F3gkQv
But it does not work. Can somebody tell me why?
def clean(file1):
    with open(sys.argv[1], "r") as file1:
        file1_lines = file1.readlines()
        id1 = [line.strip().split() for line in file1_lines]
    with open(sys.argv[2], "r") as file2:
        file2_lines = file2.readlines()
        id2 = [line.strip().split() for line in file2_lines]
    id2_dict = {i[-1]: i[:-1] for i in id2}
    # You can print id2_dict and id1:
    # print id2_dict
    # print id1
    for index, line in enumerate(file1_lines):
        key = id1[index][-1].strip("(").strip(")")
        if key in id2_dict:
            text = line[line.find("(")+1:line.find(")")]
            print text
    # or:
    # text_lines = [line[line.find("(")+1:line.find(")")] for line in file1_lines
    #               if line.split()[-1].strip("(").strip(")") in id2_dict]
    # print text_lines
I don't know exactly what output you have in mind, so I just assumed you wanted to get text_lines.
file1 is an iterator that is exhausted after all the lines in the file have been read (which will happen during the first for loop). Therefore, the following loop
for line in file1:
will never run. But even if it did, the condition
if id1==id2[0]:
will never be true, because you're comparing the entire id1 to the first character of id2. Furthermore, you'd be doing exactly the same comparison over and over again, since those variables are set before the loop and never change inside it.
And in your first two loops, you're constantly overwriting the exact same variables.
I think you need to read up on Python basics, especially the chapter on loops in the Python tutorial...
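For reference, a corrected sketch along the lines the question describes (hypothetical layout: the id is the last word of each line in the first file and the first word of each line in the second):

import sys

# Collect the ids from the second file (first word of each line).
ids = set()
with open(sys.argv[2]) as file2:
    for line in file2:
        words = line.split()
        if words:
            ids.add(words[0])

# For matching lines of the first file, print the text inside parentheses.
with open(sys.argv[1]) as file1:
    for line in file1:
        words = line.split()
        if words and words[-1] in ids:
            print(line[line.find("(") + 1:line.find(")")])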
To compare the files line by line (i.e. matching on line number):
file1 = open(sys.argv[1], "r")
file2 = open(sys.argv[2], "r")

for line1, line2 in zip(file1, file2):
    if line1.split()[-1] == line2.split()[0]:
        print(line1)  # use regex to extract the information needed

file1.close()
file2.close()
Make sure to close the files after use.
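Equivalently, a with statement closes them for you:

with open(sys.argv[1]) as file1, open(sys.argv[2]) as file2:
    for line1, line2 in zip(file1, file2):
        if line1.split()[-1] == line2.split()[0]:
            print(line1)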

How can I compare files quicker in Python?

Is there any way to make this script faster? I'm comparing one file against another and printing the lines whose third column (index 2) is equal.
import csv

output = []
a = open('/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Phase1_missing.vcf', 'r')
list1 = a.readlines()
reader1 = a.read()
b = open('/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais.vcf', 'r')
list2 = b.readlines()
reader2 = b.read()
f3 = open('/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais_and_YRI.vcf', 'w')

for line1 in list1:
    separar = line1.split("\t")
    gene = separar[2]
    for line2 in list2:
        separar2 = line2.split("\t")
        gene2 = separar2[2]
        if gene == gene2:
            print line1
            f3.write(line1)
Input example (for both files):
1 14107321 rs187821037 C T 100 PASS AA=C;SNPSOURCE=LOWCOV,EXOME;AN=2184;AVGPOST=0.9996;VT=SNP;THETA=0.0006;RSQ=0.7640;LDAF=0.0006;AC=1;ERATE=0.0003;AF=0.0005;AFR_AF=0.0020;STATUS=sample_dropout
1 14107321 rs187821037 C T 100 PASS AA=C;SNPSOURCE=LOWCOV,EXOME;AN=2184;AVGPOST=0.9996;VT=SNP;THETA=0.0006;RSQ=0.7640;LDAF=0.0006;AC=1;ERATE=0.0003;AF=0.0005;AFR_AF=0.0020;STATUS=sample_dropout
1 14107321 rs187821037 C T 100 PASS AA=C;SNPSOURCE=LOWCOV,EXOME;AN=2184;AVGPOST=0.9996;VT=SNP;THETA=0.0006;RSQ=0.7640;LDAF=0.0006;AC=1;ERATE=0.0003;AF=0.0005;AFR_AF=0.0020;STATUS=sample_dropout
The bash one-liner below does the same job with awk:
awk 'FNR==NR {a[$3]; next} $3 in a' Neandertais.vcf Phase1_missing.vcf > teste.vcf
How can I improve this Python script?
If you store your lines in dictionaries that are keyed by the column that you are interested in, you can easily use Python's built-in set functions (which run at C speed) to find the matching lines. I tested a slightly modified version of this (filenames changed, and changed split('\t') to split() because of stackoverflow formatting) and it seems to work fine:
import collections

infn1 = '/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Phase1_missing.vcf'
infn2 = '/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais.vcf'
outfn = '/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais_and_YRI.vcf'

def readfile(fname):
    '''
    Read in a file and return a dictionary of lines, keyed by the item in the third column (index 2)
    '''
    results = collections.defaultdict(list)
    # Read in binary mode -- it's quicker
    with open(fname, 'rb') as f:
        for line in f:
            parts = line.split("\t")
            if not parts:
                continue
            gene = parts[2]
            results[gene].append(line)
    return results

dict1 = readfile(infn1)
dict2 = readfile(infn2)

with open(outfn, 'wb') as outf:
    # Find keys that appear in both files
    for key in set(dict1) & set(dict2):
        # For these keys, print all the matching lines in the first file
        for line in dict1[key]:
            print(line.rstrip())
            outf.write(line)

Python: Sorting two files based on the order of one

I've been trying to do this task all day, and I really want to learn how to do it using Python. I want to take two tab-delimited files, one with an ID only and the other with the same ID and some description. I can easily merge these files on the shared ID field with unix join, but for that I need to sort both and I want to keep the ordering of the first file.
I've tried some code below, and my method has been to try to add things to a tuple, as from my understanding they will keep their order as you add to them. I haven't been able to get anything to work, though. Can anyone help?
Sample files:
file1 ->
111889
1437390
123
27998
2525778
12
1345
file2 ->
2525778'\t'item778
1345'\t'item110
123'\t'item1000
12'\t'item8889
111889'\t'item1111
1437390'\t'item222
27998'\t'item12
output ->
111889'\t'item1111
1437390'\t'item222
123'\t'item1000
27998'\t'item12
2525778'\t'item778
12'\t'item8889
1345'\t'item110
This is what I have so far:
import sys

add_list = ()
with open(sys.argv[1], 'rb') as file1, open(sys.argv[2], 'rb') as file2:
    for line2 in file2:
        f1, f2, f3 = line2.split('\t')
        #print f1, f2, f3
        for row in file1:
            #print row
            if row != f1:
                break
            else:
                add_list.append(f1, f2, '\n')
                break
The key is to use Python dictionaries; they are perfect for this task…
Here is a complete answer:
import sys

# Each id is mapped to its item name
# (split() splits at whitespace (including tabulation and newline), with no empty output strings):
items = dict(line.split() for line in open(sys.argv[2]))  # Inspired by mgilson's answer

with open(sys.argv[1]) as ids:
    for line in ids:
        id = line.rstrip()  # newline removed
        print '{}\t{}'.format(id, items[id])
Here is the result:
% python out.py file1.txt file2.txt
111889 item1111
1437390 item222
123 item1000
27998 item12
2525778 item778
12 item8889
1345 item110
PS: Note that I did not open the files in rb mode, as there is no need to keep the original newline bytes here, since we get rid of trailing newlines anyway.
I would create a dictionary which maps the ID to the field value from the second file:
with open('file2') as fin:
    d = dict(x.split(None, 1) for x in fin)
Then I would use the first file to construct the output in order from the dictionary:
with open('file1') as fin, open('output', 'w') as fout:
    for line in fin:
        key = line.strip()
        # d[key] keeps its trailing newline from split(None, 1), so strip it
        fout.write('{key}\t{value}\n'.format(key=key, value=d[key].rstrip()))
import sys

out = {}
with open(sys.argv[1], 'rb') as file1, open(sys.argv[2], 'rb') as file2:
    d2 = {}
    for line in file2:
        key, val = line.split('\t')
        d2[key] = val
    # strip the trailing newlines so the keys match those in d2
    lines = [line.strip() for line in file1]
    out = {x: d2[x] for x in lines}
I am not sure about your sorting basis.
