How can I compare files quicker in Python?

Is there any way to make this script faster? I'm comparing one file against another and printing a line when the third tab-separated column (the ID) is equal in both files.
import csv
output = []
a = open('/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Phase1_missing.vcf', 'r')
list1 = a.readlines()
reader1 = a.read()
b = open('/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais.vcf', 'r')
list2 = b.readlines()
reader2 = b.read()
f3 = open('/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais_and_YRI.vcf', 'w')
for line1 in list1:
    separar = line1.split("\t")
    gene = separar[2]
    for line2 in list2:
        separar2 = line2.split("\t")
        gene2 = separar2[2]
        if gene == gene2:
            print line1
            f3.write(line1)
Input example (for both files):
1 14107321 rs187821037 C T 100 PASS AA=C;SNPSOURCE=LOWCOV,EXOME;AN=2184;AVGPOST=0.9996;VT=SNP;THETA=0.0006;RSQ=0.7640;LDAF=0.0006;AC=1;ERATE=0.0003;AF=0.0005;AFR_AF=0.0020;STATUS=sample_dropout
1 14107321 rs187821037 C T 100 PASS AA=C;SNPSOURCE=LOWCOV,EXOME;AN=2184;AVGPOST=0.9996;VT=SNP;THETA=0.0006;RSQ=0.7640;LDAF=0.0006;AC=1;ERATE=0.0003;AF=0.0005;AFR_AF=0.0020;STATUS=sample_dropout
1 14107321 rs187821037 C T 100 PASS AA=C;SNPSOURCE=LOWCOV,EXOME;AN=2184;AVGPOST=0.9996;VT=SNP;THETA=0.0006;RSQ=0.7640;LDAF=0.0006;AC=1;ERATE=0.0003;AF=0.0005;AFR_AF=0.0020;STATUS=sample_dropout
The awk command line below does the same job in bash:
awk 'FNR==NR {a[$3]; next} $3 in a' Neandertais.vcf Phase1_missing.vcf > teste.vcf
How can I improve this Python script?

If you store your lines in dictionaries that are keyed by the column that you are interested in, you can easily use Python's built-in set functions (which run at C speed) to find the matching lines. I tested a slightly modified version of this (filenames changed, and changed split('\t') to split() because of stackoverflow formatting) and it seems to work fine:
import collections

# Use 'rb' to open files
infn1 = '/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Phase1_missing.vcf'
infn2 = '/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais.vcf'
outfn = '/home/lucas/Doutorado/Projeto Eduardo/Exoma Neandertal/Listas_eduardo/Neandertais_and_YRI.vcf'

def readfile(fname):
    '''
    Read in a file and return a dictionary of lines, keyed by the item in the third column (index 2)
    '''
    results = collections.defaultdict(list)
    # Read in binary mode -- it's quicker
    with open(fname, 'rb') as f:
        for line in f:
            parts = line.split("\t")
            if len(parts) < 3:  # skip lines without enough columns
                continue
            gene = parts[2]
            results[gene].append(line)
    return results

dict1 = readfile(infn1)
dict2 = readfile(infn2)

with open(outfn, 'wb') as outf:
    # Find keys that appear in both files
    for key in set(dict1) & set(dict2):
        # For these keys, print all the matching
        # lines in the first file
        for line in dict1[key]:
            print(line.rstrip())
            outf.write(line)
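As a follow-up: if you only need the matching lines from the first file (which is what the awk one-liner produces), a plain set of keys is enough, and the lines of the second file never need to be stored at all. A minimal sketch, assuming the same layout (key in the third tab-separated column) and using the short filenames from the awk command; the output name teste.vcf is taken from that example:

# Build a set of keys (third column) from the second file,
# then stream the first file and keep lines whose key is in the set.
with open('Neandertais.vcf') as keyfile:
    keys = set(line.split('\t')[2] for line in keyfile if line.strip())

with open('Phase1_missing.vcf') as infile, open('teste.vcf', 'w') as outfile:
    for line in infile:
        parts = line.split('\t')
        if len(parts) > 2 and parts[2] in keys:
            outfile.write(line)

This is O(n + m) like the dictionary version, but holds only the keys of one file in memory.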

Related

How to output ONLY new additions between files with Python Difflib?

I am comparing two text files using Difflib like so:
import difflib
new_file = open(file_name, "r")
old_file = open(old_file_name, "r")
file_difference = difflib.ndiff(old_file.readlines(), new_file.readlines())
My goal is to ONLY output additions. I do not want to know about changes to existing lines. However, I've run into a problem where all changes/additions are marked with "+ ", and all subtractions are marked with "- ". I've done a lot of searching, and it appears there's no way to differentiate a line that has been changed, and a line that is brand new. I am confused on how to proceed.
import csv

f1 = open(old_file_name, "r")
contents1 = f1.read()
f2 = open(file_name, "r")
contents2 = f2.read()

for data in contents2:
    if data not in contents1:
        file = open(output_path, 'a', newline='')
        # writing the data into the file
        with file:
            write = csv.writer(file)
            write.writerows(data)
A great friend of mine provided a code snippet that answered my question:
import difflib

# Open the files for comparison
with open(file_name, "r") as new_file:
    with open(old_file_name, "r") as old_file:
        # Find the differences between the two files
        file_difference = difflib.ndiff(old_file.readlines(), new_file.readlines())

new_lines = []
file_difference = tuple(x for x in file_difference)
idx = 0
fdiff_size = len(file_difference)
while idx < fdiff_size:
    line = file_difference[idx]
    if line.startswith("- "):
        if idx + 1 < fdiff_size and file_difference[idx + 1].startswith("? "):
            # this chunk is a change, so ignore this and the next 3 lines
            idx += 4
            continue
    elif line.startswith("+ "):
        new_lines.append(line)
    # always iterate after new item or no change
    idx += 1
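The snippet collects the additions in new_lines but stops there; to actually output only the new additions, you can strip ndiff's two-character "+ " prefix as you write. A small sketch (the output filename additions.txt is illustrative):

# Write only the added lines, dropping ndiff's "+ " marker.
with open("additions.txt", "w") as out_file:
    for line in new_lines:
        out_file.write(line[2:])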

Use a file to search another file and print lines matching a pattern to first file

Python noob here. I've been smashing my head trying to do this, tried several Unix tools and I'm convinced that python is the way to go.
I have two files, File1 has headers and numbers like this:
>id1
77
>id2
2
>id3
2
>id4
22
...
Note that id number is unique, but the number assigned to it may repeat. I have several files like this all with the same number of headers (~500).
File2 has all numbers of File1 and an appended sequence
1
ATCGTCATA
2
ATCGTCGTA
...
22
CCCGTCGTA
...
77
ATCGTCATA
...
Note that each sequence ID is unique, as is the sequence that follows it. I have the same number of files as for File1, but the number of sequences within each File2 may vary (~150).
My desired output is File1 with the sequences from File2; it is important that File1 maintains its original order.
>id1
ATCGTCATA
>id2
ATCGTCGTA
>id3
ATCGTCGTA
>id4
CCCGTCGTA
My approach is to extract numbers from File1 and use them as a pattern to match in File2. First I am trying to make this work with only a pair of files. Here is what I achieved:
#!/usr/bin/env python
import re

datafile = 'protein2683.fasta.txt.named'
schemaseqs = 'protein2683.fasta'

with open(datafile, 'r') as f:
    datafile_lines = set([line.strip() for line in f])  # maybe I could use regex to get only lines with number as pattern?
print(datafile_lines)

outputlist = []
with open(schemaseqs, 'r') as f:
    for line in f:
        seqs = line.split(',')[0]
        if seqs[1:-1] in datafile_lines:
            outputlist.append(line)
print(outputlist)
This outputs a mix of patterns from File1 and the sequences from File2. Any help is appreciated.
PS: I am open to modifications in the file structure; I tried substituting the \n in File2 with "," to no avail.
import re

datafile = 'protein2683.fasta.txt.named'
schemaseqs = 'protein2683.fasta'

datafile_lines = []
d = {}
prev = None
with open(datafile, 'r') as f:
    i = 0
    for line in f:
        if i % 2 == 0:
            d[line.strip()] = 0
            prev = line.strip()
        else:
            d[prev] = line.strip()
        i += 1

new_d = {}
with open(schemaseqs, 'r') as f:
    i = 0
    prev = None
    for line in f:
        if i % 2 == 0:
            new_d[line.strip()] = 0
            prev = line.strip()
        else:
            new_d[prev] = line.strip()
        i += 1

for key, value in d.items():
    if value in new_d:
        d[key] = new_d[value]
print(d)

with open(datafile, 'w') as filee:
    for k, v in d.items():
        filee.writelines(k)
        filee.writelines('\n')
        filee.writelines(v)
        filee.writelines('\n')
Creating two dictionaries is straightforward, and then you just map the values between the two dictionaries.
Since the files are so neatly organized, I wouldn't use a set to store the lines. Sets don't enforce order, and the order of these lines conveys a lot of information. I also wouldn't use Regex; it's probably overkill for the task of parsing individual lines, but not powerful enough to keep track of which ID corresponds to each gene sequence.
Instead, I would read the files in the opposite order. First, read the file with the gene sequences and build a mapping of IDs to genes. Then read in the first file and replace each id with the corresponding value in that mapping.
If the IDs are a continuous sequence (1, 2, 3... n, n+1), then a list is probably the easiest way to store them. If the file is already in order, you don't even have to pay attention to the ID numbers; you can just skip every other row and append each gene sequence to an array in order. If they aren't continuous, you can use a dictionary with the IDs as keys. I'll use the dictionary approach for this example:
id_to_gene_map = {}

with open(file2, 'r') as id_to_gene_file:
    for line_number, line in enumerate(id_to_gene_file, start=1):
        if line_number % 2 == 1:  # Update ID on odd numbered lines, including line 1
            current_id = line
        else:
            id_to_gene_map[current_id] = line  # Map previous line's ID to this line's value

with open(file1, 'r') as input_file, open('output.txt', 'w') as output_file:
    for line in input_file:
        if not line.startswith(">"):  # Keep ">id1" lines unchanged
            line = id_to_gene_map[line]  # Otherwise, replace with the corresponding gene
        output_file.write(line)
In this case, the IDs and values both have trailing newlines. You can strip them out, but since you'll want to add them back in for writing the output file, it's probably easiest to leave them alone.
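For completeness, here is what the list-based variant mentioned above might look like, under the stated assumption that the IDs in File2 form the continuous sequence 1, 2, 3... and appear in order (a sketch, not tested against real data):

genes = []
with open(file2, 'r') as id_to_gene_file:
    for line_number, line in enumerate(id_to_gene_file, start=1):
        if line_number % 2 == 0:   # even-numbered lines hold the sequences
            genes.append(line)     # genes[0] belongs to ID 1, genes[1] to ID 2, ...

# The sequence for ID n is then genes[n - 1].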

Python to compare huge text files based on multiple keys

I have two text files each of approximately 1GB where each line has 60 columns in it.
There are 6 columns which are keys to compare in each file.
Example :
file1:
4|null|null|null|null|null|3590740374739|20077|7739662|75414741|
file2:
4|null|11|333|asdsd|null|3590740374739|20077|7739662|75414741|
Here the two lines are considered equal, as columns 7, 8, 9, and 10 (the keys) are the same in both files.
I tried a sample to compare files without considering keys, which works fine, but I need to compare based on the keys, not character to character in each line.
Here is the sample code I wrote to compare the files without considering keys.
matched = open('matchedrecords.txt', 'w')

with open('srcone.txt') as b:
    blines = set(b)

with open('srctwo.txt') as a:
    alines = set(a)

with open('notInfirstSource.txt', 'w') as result:
    for line in alines:
        if line not in blines:
            result.write(line)
        else:
            matched.write(line)

with open('notInsecondSource.txt', 'w') as non:
    for lin in blines:
        if lin not in alines:
            non.write(lin)

matched.close()
This is one way you can compare the lines based on keys/columns, but I am not sure how efficient it is.
matched = open('matchedrecords.txt', 'w')

with open('srcone.txt') as b:
    blines = set(b)

with open('srctwo.txt') as a:
    alines = set(a)

# List of columns or keys to compare
list_of_columns_to_compare = [7, 8, 9]

for blin in blines:
    for alin in alines:
        # The lists must be rebuilt for each pair of lines,
        # otherwise stale values accumulate across iterations
        a_columns = []
        b_columns = []
        for column_no in list_of_columns_to_compare:
            # Appending columns to a list to compare
            b_columns.append(blin.split('|')[column_no])
            a_columns.append(alin.split('|')[column_no])
        if a_columns == b_columns:
            matched.write(blin + " = " + alin)

matched.close()
Taking a cue from a recipe for KeyedSets on ActiveState, you can build a set and then simply use set intersection and set difference to produce your results:
import collections

class Set(collections.Set):
    @staticmethod
    def key(s): return tuple(s.split('|')[6:10])
    def __init__(self, it): self._dict = {self.key(s): s for s in it}
    def __len__(self): return len(self._dict)
    def __iter__(self): return self._dict.itervalues()
    def __contains__(self, value): return self.key(value) in self._dict

data = {}
for filename in 'srcone.txt', 'srctwo.txt':
    with open(filename) as f:
        data[filename] = Set(f)

with open('notInFirstSource.txt', 'w') as f:
    for lines in data['srctwo.txt'] - data['srcone.txt']:
        f.write(''.join(lines))

with open('notInSecondSource.txt', 'w') as f:
    for lines in data['srcone.txt'] - data['srctwo.txt']:
        f.write(''.join(lines))

with open('matchedrecords.txt', 'w') as f:
    for lines in data['srcone.txt'] & data['srctwo.txt']:
        f.write(''.join(lines))
Finally, I could achieve this in very little time using dictionaries: a 370 MB data file compared against a 270 MB data file in at most 50 seconds (using a tuple as the key).
This is the script:
reader = open("fileA", 'r')
reader2 = open("fileB", 'r')
TmpDict = {}
TmpDict2 = {}

for line in reader:
    line = line.strip()
    TmpArr = line.split('|')
    # Forming a dictionary with the below columns as keys
    TmpDict[TmpArr[2], TmpArr[3], TmpArr[11], TmpArr[12], TmpArr[13], TmpArr[14]] = line

for line in reader2:
    line = line.strip()
    TmpArr = line.split('|')
    TmpDict2[TmpArr[2], TmpArr[3], TmpArr[11], TmpArr[12], TmpArr[13], TmpArr[14]] = line

outfile = open('MatchedRecords.txt', 'w')
outfileNonMatchedB = open('notInB', 'w')
outfileNonMatchedA = open('notInA', 'w')

for k, v in TmpDict.iteritems():
    if k in TmpDict2:
        outfile.write(v + '\n')
    else:
        outfileNonMatchedB.write(v + '\n')
outfile.close()
outfileNonMatchedB.close()

for k, v in TmpDict2.iteritems():
    if k not in TmpDict:
        outfileNonMatchedA.write(v + '\n')
outfileNonMatchedA.close()
Can any improvements be made to this? Suggestions welcome!
Thanks
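One possible improvement, sketched below on the same key columns and filenames: index only one file and stream the other, so only one file is ever held in memory, and use context managers instead of bare open/close calls.

def key_of(line):
    parts = line.strip().split('|')
    # Same key columns as in the script above
    return (parts[2], parts[3], parts[11], parts[12], parts[13], parts[14])

# Index fileA once.
with open('fileA') as reader:
    tmp_dict = {key_of(line): line.strip() for line in reader}

# Stream fileB, matching against the index.
matched_keys = set()
with open('fileB') as reader2, \
     open('MatchedRecords.txt', 'w') as outfile, \
     open('notInA', 'w') as not_in_a:
    for line in reader2:
        k = key_of(line)
        if k in tmp_dict:
            outfile.write(tmp_dict[k] + '\n')  # the fileA line, as before
            matched_keys.add(k)
        else:
            not_in_a.write(line.strip() + '\n')

# fileA lines whose key never appeared in fileB.
with open('notInB', 'w') as not_in_b:
    for k, v in tmp_dict.items():
        if k not in matched_keys:
            not_in_b.write(v + '\n')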

Python: Sorting two files based on the order of one

I've been trying to do this task all day, and I really want to learn how to do it using Python. I want to take two tab-delimited files, one with an ID only and the other with the same ID and some description. I can easily merge these files on the shared ID field with unix join, but for that I need to sort both and I want to keep the ordering of the first file.
I've tried some code below, and my method has been to try and add things to a tuple, as from my understanding, tuples will keep their order as you add to them. I haven't been able to get anything to work, though. Can anyone help?
Sample files:
file1 ->
111889
1437390
123
27998
2525778
12
1345
file2 ->
2525778'\t'item778
1345'\t'item110
123'\t'item1000
12'\t'item8889
111889'\t'item1111
1437390'\t'item222
27998'\t'item12
output ->
111889'\t'item1111
1437390'\t'item222
123'\t'item1000
27998'\t'item12
2525778'\t'item778
12'\t'item8889
1345'\t'item110
This is what I have so far:
import sys

add_list = ()
with open(sys.argv[1], 'rb') as file1, open(sys.argv[2], 'rb') as file2:
    for line2 in file2:
        f1, f2, f3 = line2.split('\t')
        # print f1, f2, f3
        for row in file1:
            # print row
            if row != f1:
                break
            else:
                add_list.append(f1, f2, '\n')
                break
The key is to use Python dictionaries; they are perfect for this task…
Here is a complete answer:
import sys

# Each id is mapped to its item name
# (split() splits at whitespace (including tabulation and newline), with no empty output strings):
items = dict(line.split() for line in open(sys.argv[2]))  # Inspired by mgilson's answer

with open(sys.argv[1]) as ids:
    for line in ids:
        id = line.rstrip()  # newline removed
        print '{}\t{}'.format(id, items[id])
Here is the result:
% python out.py file1.txt file2.txt
111889 item1111
1437390 item222
123 item1000
27998 item12
2525778 item778
12 item8889
1345 item110
PS: Note that I did not open the files in rb mode, as there is no need to keep the original newline bytes here, since we get rid of the trailing newlines anyway.
I would create a dictionary which maps the ID to the field value from the second file:
with open('file2') as fin:
    # strip the trailing newline so the stored values are clean
    d = dict(x.strip().split(None, 1) for x in fin)
Then I would use the first file to construct the output in order from the dictionary:
with open('file1') as fin, open('output', 'w') as fout:
    for line in fin:
        key = line.strip()
        fout.write('{key}\t{value}\n'.format(key=key, value=d[key]))
import sys

out = {}
with open(sys.argv[1], 'rb') as file1, open(sys.argv[2], 'rb') as file2:
    d2 = {}
    for line in file2:
        (key, val) = line.split('\t')
        d2[key] = val
    lines = file1.readlines()
    # strip the newlines so the keys from file1 match the keys in d2
    out = {x.strip(): d2[x.strip()] for x in lines}
I am not sure about your sorting basis.

Compare two different files line by line in python

I have two different files and I want to compare their contents line by line, and write their common contents to a different file. Note that both of them contain some blank lines.
Here is my pseudo code:
file1 = open('some_file_1.txt', 'r')
file2 = open('some_file_2.txt', 'r')
FO = open('some_output_file.txt', 'w')

for line1 in file1:
    for line2 in file2:
        if line1 == line2:
            FO.write("%s\n" % (line1))

FO.close()
file1.close()
file2.close()
However, by doing this, I got lots of blank lines in my FO file. It seems like the common blank lines are also written. I want to write only the text part. Can somebody please help me?
For example: my first file (file1) contains data:
Config:
Hostname = TUVALU
BT:
TS_Ball_Update_Threshold = 0.2
BT:
TS_Player_Search_Radius = 4
BT:
Ball_Template_Update = 0
while second file (file2) contains data:
Pole_ID = 2
Width = 1280
Height = 1024
Color_Mode = 0
Sensor_Scale = 1
Tracking_ROI_Size = 4
Ball_Template_Update = 0
If you notice, the last line of each file is the same; hence, I want to write it to my FO file. But the problem with my approach is that it also writes the common blank lines. Should I use regex for this problem? I do not have experience with regex.
This solution reads both files in one pass, excludes blank lines, and prints common lines regardless of their position in the file:
with open('some_file_1.txt', 'r') as file1:
    with open('some_file_2.txt', 'r') as file2:
        same = set(file1).intersection(file2)

same.discard('\n')

with open('some_output_file.txt', 'w') as file_out:
    for line in same:
        file_out.write(line)
Yet another example...
from __future__ import print_function  # Only for Python 2

with open('file1.txt') as f1, open('file2.txt') as f2, open('outfile.txt', 'w') as outfile:
    for line1, line2 in zip(f1, f2):
        if line1 == line2:
            print(line1, end='', file=outfile)
And if you want to eliminate common blank lines, just change the if statement to:
if line1.strip() and line1 == line2:
.strip() removes all leading and trailing whitespace, so if that's all that's on a line, it will become an empty string "", which is considered false.
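A quick interactive check of that behaviour:

>>> "   \n".strip()
''
>>> bool("   \n".strip())
False
>>> bool("text\n".strip())
True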
If you are specifically looking to get the difference between two files, then this might help:
with open('first_file', 'r') as file1:
    with open('second_file', 'r') as file2:
        difference = set(file1).difference(file2)

difference.discard('\n')

with open('diff.txt', 'w') as file_out:
    for line in difference:
        file_out.write(line)
If order is preserved between files you might also prefer difflib. Although Robᵩ's result is the bona-fide standard for intersections, you might actually be looking for a rough diff-like output:
from difflib import Differ

with open('cfg1.txt') as f1, open('cfg2.txt') as f2:
    differ = Differ()
    for line in differ.compare(f1.readlines(), f2.readlines()):
        if line.startswith(" "):
            print(line[2:], end="")
That said, this has a different behaviour to what you asked for (order is important) even though in this instance the same output is produced.
Once the file object is iterated, it is exhausted.
>>> f = open('1.txt', 'w')
>>> f.write('1\n2\n3\n')
>>> f.close()
>>> f = open('1.txt', 'r')
>>> for line in f: print line
...
1
2
3
# exhausted, another iteration does not produce anything.
>>> for line in f: print line
...
>>>
Use file.seek (or close/open the file) to rewind the file:
>>> f.seek(0)
>>> for line in f: print line
...
1
2
3
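Applied to the original nested loop, rewinding file2 before each pass of the inner loop is one way to make the comparison behave as intended (though it stays O(n*m), so the set-based answers above are still the better choice):

with open('some_file_1.txt') as file1, open('some_file_2.txt') as file2, \
        open('some_output_file.txt', 'w') as fo:
    for line1 in file1:
        file2.seek(0)  # rewind, so the inner loop sees every line each time
        for line2 in file2:
            if line1 == line2:
                fo.write(line1)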
Try this:
from __future__ import with_statement

filename1 = "G:\\test1.TXT"
filename2 = "G:\\test2.TXT"

with open(filename1) as f1:
    with open(filename2) as f2:
        file1list = f1.read().splitlines()
        file2list = f2.read().splitlines()
        list1length = len(file1list)
        list2length = len(file2list)
        if list1length == list2length:
            for index in range(len(file1list)):
                if file1list[index] == file2list[index]:
                    print file1list[index] + "==" + file2list[index]
                else:
                    print file1list[index] + "!=" + file2list[index] + " Not-Equal"
        else:
            print "difference in the size of the file and number of lines"
I have just been faced with the same challenge, but I thought, "Why program this in Python if you can solve it with a simple grep?", which led to the following Python code:
import subprocess
from subprocess import PIPE

try:
    output1, errors1 = subprocess.Popen(["c:\\cygwin\\bin\\grep", "-Fvf", "c:\\file1.txt", "c:\\file2.txt"], shell=True, stdout=PIPE, stderr=PIPE).communicate()
    output2, errors2 = subprocess.Popen(["c:\\cygwin\\bin\\grep", "-Fvf", "c:\\file2.txt", "c:\\file1.txt"], shell=True, stdout=PIPE, stderr=PIPE).communicate()
    if len(output1) + len(output2) + len(errors1) + len(errors2) > 0:
        print("Compare result : There are differences:")
        if len(output1) + len(output2) > 0:
            print(" Output differences : ")
            print(output1)
            print(output2)
        if len(errors1) + len(errors2) > 0:
            print(" Errors : ")
            print(errors1)
            print(errors2)
    else:
        print("Compare result : Both files are equal")
except Exception as ex:
    print("Compare result : Exception during comparison")
    print(ex)
    raise
The trick behind this is the following:
grep -Fvf file1.txt file2.txt prints the lines of file2.txt that are not found in file1.txt, so an empty output means all entries of file2.txt are present in file1.txt. By doing this in both directions we can see whether the content of both files is "equal". I put "equal" between quotes because duplicate lines are disregarded in this way of working.
Obviously, this is just an example: you can replace grep by any commandline file comparison tool.
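For reference, a pure-Python sketch of the same bidirectional check using sets. It is not an exact equivalent (grep -F matches its patterns as substrings, while sets compare whole lines), but like the grep approach it disregards duplicate lines:

with open('file1.txt') as f1, open('file2.txt') as f2:
    lines1, lines2 = set(f1), set(f2)

only_in_1 = lines1 - lines2  # roughly what grep -Fvf file2.txt file1.txt reports
only_in_2 = lines2 - lines1  # roughly what grep -Fvf file1.txt file2.txt reports

if only_in_1 or only_in_2:
    print("Compare result : There are differences:")
    print(''.join(only_in_1) + ''.join(only_in_2))
else:
    print("Compare result : Both files are equal")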
difflib is well worth the effort, with nice condensed output.
from pathlib import Path
import difflib

mypath = '/Users/x/lib/python3'
file17c = Path(mypath, 'oop17c.py')
file18c = Path(mypath, 'oop18c.py')

with open(file17c) as file_1:
    file1 = file_1.readlines()

with open(file18c) as file_2:
    file2 = file_2.readlines()

for line in difflib.unified_diff(
        file1, file2, fromfile=str(file17c), tofile=str(file18c), lineterm=''):
    print(line)
output
+ ... unique stuff present in file18c
- ... stuff absent in file18c but present in file17c
