How to diff lines read from multiple files in Python?

I'm trying to diff three files in Python on Linux.
I have three files (a, b, c):
/work/a
/work/b
/work/c
File a contains the absolute paths of some hex files
(example contents of a):
user/work/test0/.../bin0.hex
user/work/test0/.../bin1.hex
user/work/test0/.../bin2.hex
user/work/test0/.../bin3.hex
...
File b contains the absolute paths of some hex files
(example contents of b):
user/work/test1/.../bin0.hex
user/work/test1/.../bin1.hex
user/work/test1/.../bin2.hex
user/work/test1/.../bin3.hex
...
File c contains the absolute paths of some hex files
(example contents of c):
user/work/test2/.../bin0.hex
user/work/test2/.../bin1.hex
user/work/test2/.../bin2.hex
user/work/test2/.../bin3.hex
...
and each hex file contains a string list such as
[ 0] A4B232
[ 1] 14B2F2
[ 2] 1472F1
...
I want to diff each corresponding triplet of hex files listed in a, b and c,
so I started coding as below in Python. So far, I have successfully saved the data into the global variables.
arr_s_line1 = []
arr_s_line2 = []
arr_s_line3 = []
#def dfile():
#    with open('a') as f1:
#        f_lines1 = f1.read().splitlines()
#        for f_line1 in f_lines1:
#            with open(f_line1) as f2:
#                s_lines1 = f2.read().splitlines()
#                for s_line1 in s_lines1:
#                    arr_s_line1.append(s_line1)

def prtf():
    with open('a') as fprtfa:
        linesa = fprtfa.read().splitlines()
    with open('b') as fprtfb:
        linesb = fprtfb.read().splitlines()
    with open('c') as fprtfc:
        linesc = fprtfc.read().splitlines()
    for linea in linesa:
        with open(linea) as fa:
            s_linesa = fa.read().splitlines()
            for s_linea in s_linesa:
                arr_s_line1.append(s_linea)
    for lineb in linesb:
        with open(lineb) as fb:
            s_linesb = fb.read().splitlines()
            for s_lineb in s_linesb:
                arr_s_line2.append(s_lineb)
    for linec in linesc:
        with open(linec) as fc:
            s_linesc = fc.read().splitlines()
            for s_linec in s_linesc:
                arr_s_line3.append(s_linec)

if __name__ == "__main__":
    prtf()
Now I want to diff arr_s_line1[i], arr_s_line2[i] and arr_s_line3[i] to know whether there is a mismatch or not, and if there is a mismatch, print the name of the mismatched file and the line where it occurs. How do I diff data read from multiple files in Python?
In particular, the problem with this approach is that running it over a large number of files requires a lot of memory, which I want to avoid.
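One way to avoid holding everything in memory is to stream the three path lists and the hex files in parallel and compare them line by line, so only one line of each file is in memory at a time. Here is a minimal sketch of that idea (it assumes the three path lists line up one-to-one; the function name is illustrative):

from itertools import zip_longest

def diff_three(list_a='a', list_b='b', list_c='c'):
    # Walk the three path-list files in parallel, one path per iteration.
    with open(list_a) as fa, open(list_b) as fb, open(list_c) as fc:
        for path_a, path_b, path_c in zip(fa, fb, fc):
            paths = [p.strip() for p in (path_a, path_b, path_c)]
            # Compare the three hex files line by line.
            with open(paths[0]) as ha, open(paths[1]) as hb, open(paths[2]) as hc:
                for lineno, lines in enumerate(zip_longest(ha, hb, hc), start=1):
                    if len(set(lines)) > 1:  # the three lines are not all identical
                        print("mismatch at line %d: %s / %s / %s"
                              % (lineno, paths[0], paths[1], paths[2]))

if __name__ == "__main__":
    diff_three()

zip_longest pads the shorter file with None, so a length mismatch between the hex files is also reported as a difference.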

Related

Reading variable length binary values from a file in python

I have three text values that I am encrypting and then writing to a file. Later I want to read the values back (in another script) and decrypt them.
I've successfully encrypted the values:
cenc = rsa.encrypt(client_name.encode('utf8'), publicKey)
denc = rsa.encrypt(expiry_date.encode('utf8'), publicKey)
fenc = rsa.encrypt(features.encode('utf8'), publicKey)
and written to a binary file:
licensefh = open("license.sfb", "wb")
licensefh.write(cenc)
licensefh.write(denc)
licensefh.write(fenc)
licensefh.close()
The three values cenc, denc and fenc are all of different lengths so when I read the file back:
licensefh = open("license.sfb", "rb")
encMessage = licensefh.read()
encMessage contains the entire file and I don't know how to get the three values back again.
I've tried using a separator between the values:
SEP = bytes(chr(0x02).encode('utf8'))
...
licensefh.write(cenc)
licensefh.write(SEP)
...
and then using encMessage.partition(SEP) or encMessage.split(SEP) but the data invariably contains the SEP value in it somewhere (I've tried a few different characters) so that didn't work.
I tried getting the length of the bytes objects cenc, denc and fenc, but this returned 256 for each value even though the contents of the variables are all different lengths.
My question is this. How do I write these three variable length values to a binary file and then separate them when I read them back again?
Here's an example of the 3 binary values:
b'tX\x10Fo\x89\x10~\x83Pok\xd1\xfb\xbe\x0e<a\xe5\x11md:\xe6\x84#\xfa\xf8\xe5\xeb\xf8\xdc{\xc0Z\xa0\xc0^\xc1\xd9\x820\xec\xec\xb0R\x99/\xa2l\x88\xa9\xa6g\xa3\x01m\xf9\x7f\x91\xb9\xe1\x80\xccs|\xb7_\xa9Fp\x11yvG\xdc\x02d\x8aK2\x92t\x0e\x1f\xca\x19\xbb&\xaf{\xc0y>\t|\x86\xab\x16.\xa5kZ"\xab6\xaaV\xf4w\x7f\xc5q\x07\xef\xa9\xa5\xa3\xf3 6\xdb\x03\x19S\xbd\x81\xf9\xc8\xc5\x90\x1e\x19\x86\xa4q\xe3?i\xc4\xac\t\xd5=3C\x9b#\xc3IuAN,\xeat\xc6\x96VFL\x1eFWZ\xa4\xd73\x92P#\x1d\xb9\x12\x15\xc9\xd4~\x8aWm^\xb8\x8b\x9d\x88\n)\xeb#\xe3\x93\xb1\\\xd6^\xe0\xce\xa2(\x05\xf5\xe6\x8b\xd1\x15\xd8v\xf0\xae\x90\xd8?\x01\r\x00\xf4\xa5\xadM|%\x98\xa9SR\xc6\xd0K\x9e&\xc3\xe0M\x81\x87\xdea\xcc\xd5\x9c\xcd\xfd1l\x1f\xb9?\xed\xd1\x95\xbc\x11\x85U9'
b'l\xd3S\xcc\x03\x9a\xf2\xfdr\xca\xbbA\x06\xfb\xd8\xbbWi\xdc\xb1\xf6&\x97T\x81Kl\r\x86\x9b\x95?\x94}\x8a\xd3\xa1V\x81\xd3]*B\x1f\x96`\xa3\xd1\xf2|B\x84?\xa0\ns\xb7\xcf\x18Y\x87\xcfR\x87!\x14\x81!\xf7\xf2\xe5x|=O\xe3\xba2\xf2!\x93\x0fT7\x0c~4\xa3\xe5\xb7\xf9wy\xb5\x12FM\x96\xd9\xfd\xedn\x9c\xacw\x1b\xc2\x17+\xb6\x05`\x10\xf8\xe4\x01\xde\xc7\xa2\xa0\x80\xd8\x15\xb1+<s\xc7\x19\x9c\x14\xb0\x1a"\x10\xbb\x0f\xe1\x05\x93\xd2?xX\xd9\x93\x8an\x8d\xcd\xbd!c\xd0,\xa45\xbai\xe3\xccx\x08\xaa,\xd1\xe5\'t\x91\xb8\xf2n$\x0c\xf9-\xb4\xc2\x07\x81\xe1\xe7\x8e\xb3\x98\x11\xf3\xa6\xd9wz\x9a3\xc9\x9c?z\xd8\xaa\x08}\xa2\x9c[\xf2\x9d\xe4\xcdb\xddl\xceV\x7f\xf1\x81\xb3\x88\x1e\x9c5?k\x0f\xc9\x86\x86&\xedV.\xa7\x8d\x13&V\xad\xca\xe5\x93\xfe\xa5\x94\xbc\xf5\xd1{Cl\xc0\x030\x92\x03\xc9'
b'#\xbdd7\xe9\xa0{\t\xb9\x87B\x9e\xf9\x97P^\xf3V\xb6\x93\x1f(J\x0b\xa3\xbf\xd8\x04\x86T\xa4\xca\xf3\xe8%\xddC\x11\xdb5\xff,\xf7\x13\xd7\xd2\xbc\xf3\x893\x83\xdcmJ\xc8p\xdf\x07V\x7fb\xeb\xa9\x8b\x0f\xca\xf9\x05\xfc\xdfS\x94b\x90\xcd\xfcn?/]\x11\xaf\xe606\xfb\\U59\xa0>\xbd\xd8\x1c\xa8\xca\x83\xf4C\x95v7\xc6\xe00\xe4,d_/\x83\xa0\xb9mO\x0e\xc4\x97J\x15\xf0\xca-\xa0\xafT\xe4\x82\x03\n\x14:\xa1\xdcL\x98\x9d,1\xfa\x10\xf4\xfd\xa0\x0b\xc7\x13!\xf7\xdb/\xda\x1a\x9df\x1cQ\xc0\x99H\x08\xa0c\x8f9/4\xc4\x05\xc6\x9eM\x8e\xe5V\xf8D\xc3\xfd\xad4\x94A\xb9[\x80\xb9\xcf\xe6\xd9\xb3M2\xd9N\xfbA\x18\x84/W\x9b\x92\xfe\xbb\xd6C\x85\xa3\xc6\xd2T\xd0\xb2\xb9\xf7R\xb4(s\xda\xbcX,9w\x17\x1c\xfb|\xa0\x87\xba\xca6>y\xba\\L4wc\x94\xe7$Y\x89\x07\x9b\xfe\x9b?{\x85'
#pippo1980's comment is how I would do it, using struct:
import struct
cenc = b'tX\x10Fo\x89\x10~\x83Pok\xd1\xfb\xbe\x0e<a\xe5\x11md:\xe6\x84#\xfa\xf8\xe5\xeb\xf8\xdc{\xc0Z\xa0\xc0^\xc1\xd9\x820\xec\xec\xb0R\x99/\xa2l\x88\xa9\xa6g\xa3\x01m\xf9\x7f\x91\xb9\xe1\x80\xccs|\xb7_\xa9Fp\x11yvG\xdc\x02d\x8aK2\x92t\x0e\x1f\xca\x19\xbb&\xaf{\xc0y>\t|\x86\xab\x16.\xa5kZ"\xab6\xaaV\xf4w\x7f\xc5q\x07\xef\xa9\xa5\xa3\xf3 6\xdb\x03\x19S\xbd\x81\xf9\xc8\xc5\x90\x1e\x19\x86\xa4q\xe3?i\xc4\xac\t\xd5=3C\x9b#\xc3IuAN,\xeat\xc6\x96VFL\x1eFWZ\xa4\xd73\x92P#\x1d\xb9\x12\x15\xc9\xd4~\x8aWm^\xb8\x8b\x9d\x88\n)\xeb#\xe3\x93\xb1\\\xd6^\xe0\xce\xa2(\x05\xf5\xe6\x8b\xd1\x15\xd8v\xf0\xae\x90\xd8?\x01\r\x00\xf4\xa5\xadM|%\x98\xa9SR\xc6\xd0K\x9e&\xc3\xe0M\x81\x87\xdea\xcc\xd5\x9c\xcd\xfd1l\x1f\xb9?\xed\xd1\x95\xbc\x11\x85U9'
denc = b'l\xd3S\xcc\x03\x9a\xf2\xfdr\xca\xbbA\x06\xfb\xd8\xbbWi\xdc\xb1\xf6&\x97T\x81Kl\r\x86\x9b\x95?\x94}\x8a\xd3\xa1V\x81\xd3]*B\x1f\x96`\xa3\xd1\xf2|B\x84?\xa0\ns\xb7\xcf\x18Y\x87\xcfR\x87!\x14\x81!\xf7\xf2\xe5x|=O\xe3\xba2\xf2!\x93\x0fT7\x0c~4\xa3\xe5\xb7\xf9wy\xb5\x12FM\x96\xd9\xfd\xedn\x9c\xacw\x1b\xc2\x17+\xb6\x05`\x10\xf8\xe4\x01\xde\xc7\xa2\xa0\x80\xd8\x15\xb1+<s\xc7\x19\x9c\x14\xb0\x1a"\x10\xbb\x0f\xe1\x05\x93\xd2?xX\xd9\x93\x8an\x8d\xcd\xbd!c\xd0,\xa45\xbai\xe3\xccx\x08\xaa,\xd1\xe5\'t\x91\xb8\xf2n$\x0c\xf9-\xb4\xc2\x07\x81\xe1\xe7\x8e\xb3\x98\x11\xf3\xa6\xd9wz\x9a3\xc9\x9c?z\xd8\xaa\x08}\xa2\x9c[\xf2\x9d\xe4\xcdb\xddl\xceV\x7f\xf1\x81\xb3\x88\x1e\x9c5?k\x0f\xc9\x86\x86&\xedV.\xa7\x8d\x13&V\xad\xca\xe5\x93\xfe\xa5\x94\xbc\xf5\xd1{Cl\xc0\x030\x92\x03\xc9'
fenc = b'#\xbdd7\xe9\xa0{\t\xb9\x87B\x9e\xf9\x97P^\xf3V\xb6\x93\x1f(J\x0b\xa3\xbf\xd8\x04\x86T\xa4\xca\xf3\xe8%\xddC\x11\xdb5\xff,\xf7\x13\xd7\xd2\xbc\xf3\x893\x83\xdcmJ\xc8p\xdf\x07V\x7fb\xeb\xa9\x8b\x0f\xca\xf9\x05\xfc\xdfS\x94b\x90\xcd\xfcn?/]\x11\xaf\xe606\xfb\\U59\xa0>\xbd\xd8\x1c\xa8\xca\x83\xf4C\x95v7\xc6\xe00\xe4,d_/\x83\xa0\xb9mO\x0e\xc4\x97J\x15\xf0\xca-\xa0\xafT\xe4\x82\x03\n\x14:\xa1\xdcL\x98\x9d,1\xfa\x10\xf4\xfd\xa0\x0b\xc7\x13!\xf7\xdb/\xda\x1a\x9df\x1cQ\xc0\x99H\x08\xa0c\x8f9/4\xc4\x05\xc6\x9eM\x8e\xe5V\xf8D\xc3\xfd\xad4\x94A\xb9[\x80\xb9\xcf\xe6\xd9\xb3M2\xd9N\xfbA\x18\x84/W\x9b\x92\xfe\xbb\xd6C\x85\xa3\xc6\xd2T\xd0\xb2\xb9\xf7R\xb4(s\xda\xbcX,9w\x17\x1c\xfb|\xa0\x87\xba\xca6>y\xba\\L4wc\x94\xe7$Y\x89\x07\x9b\xfe\x9b?{\x85'
packing_format = "<HHH" # little-endian, 3 * (2-byte unsigned short)
with open("license.sfb", "wb") as licensefh:
    licensefh.write(struct.pack(packing_format, len(cenc), len(denc), len(fenc)))
    licensefh.write(cenc)
    licensefh.write(denc)
    licensefh.write(fenc)
    # close is automatic with a context-manager
with open("license.sfb", "rb") as licensefh2:
    header_length = struct.calcsize(packing_format)
    cenc2_len, denc2_len, fenc2_len = struct.unpack(packing_format, licensefh2.read(header_length))
    cenc2 = licensefh2.read(cenc2_len)
    denc2 = licensefh2.read(denc2_len)
    fenc2 = licensefh2.read(fenc2_len)
    assert len(cenc2) == cenc2_len and len(denc2) == denc2_len and len(fenc2) == fenc2_len  # the file was not truncated
    unread_bytes = licensefh2.read()  # until EOF
    assert len(unread_bytes) == 0  # there is nothing else in the file, everything has been read

assert cenc == cenc2
assert denc == denc2
assert fenc == fenc2

Split huge file into n files keeping first 7 columns + next 3 columns until column n

I have a huge data frame with column names:
A,B,C,D,F,G,H,GT_a,N_a_,E_a,GT_b,N_b_,E_b,GT_c,N_c_,E_c,...,GT_n,N_n,E_n
Using unix/bash or python, I want to produce n individual files with the following columns:
A,B,C,D,F,G,H,GT_a,N_a_,E_a
A,B,C,D,F,G,H,GT_b,N_b_,E_b
A,B,C,D,F,G,H,GT_c,N_c_,E_c
....
A,B,C,D,F,G,H,GT_n,N_n_,E_n
Each file should be called: a.txt, b.txt, c.txt,...,n.txt
Here are a couple of solutions with bash tools.
1. bash
Using cut inside a bash loop. This will spawn n processes and parse the file n times.
Update, for the case where the ids in the column names are not just a sequence of letters but arbitrary strings, repeating every 3 columns after the first 7 columns: we have to first read the header of the file and extract the ids, e.g. a quick solution is to use awk and print every 8th, 11th, etc. column into a bash array.
#!/bin/bash
first=7
#ids=( {a..n} )
ids=( $( head -1 "$1" | awk -F"_" -v RS="," -v f="$first" 'NR>f && (NR+1)%3==0{print $2}' ) )
for i in "${!ids[@]}"; do
    cols="1-$first,$((first+1+3*i)),$((first+2+3*i)),$((first+3+3*i))"
    cut -d, -f"$cols" "$1" > "${ids[i]}.txt"
done
Usage: bash test.sh file
2. awk
Or you can use awk. Here I customize just the number of outputs; the other parts can be handled as in the first solution.
BEGIN { FS=OFS=","; times=14 }
{
    for (i=1; i<=times; i++) {
        print $1,$2,$3,$4,$5,$6,$7,$(5+3*i),$(6+3*i),$(7+3*i) > sprintf("%c.txt", i+96)
    }
}
Usage: awk -f test.awk file.
This solution should be fast, as it parses the file once. But it shouldn't be used like this for a large number of output files; it could throw a "too many open files" error. For the range of the letters, it should be OK.
This should write out the different files, with a different header for each file. You'll have to change COL_NAMES_TO_WRITE to the columns that you want.
It uses the standard library, so no pandas. It won't write out more than 26 different files, but the filename generator could be changed to allow more.
If I'm interpreting this question correctly, you want to split this into 14 files (a..n).
You'll have to copy the code below into a file, splitter.py,
and then run this command:
python3.8 splitter.py -fn largefile.txt -n 14
Where largefile.txt is your huge file that you need to split.
import argparse
import csv
import string

COL_NAMES_TO_WRITE = "A,B,C,D,F,G,H,GT_{letter},N_{letter},E_{letter}"
WRITTEN_HEADERS = set()  # place to keep track of whether headers have been written

def output_file_generator(num):
    if num > 26:
        raise ValueError(f"Can only print out 26 different files, not {num}")
    i = 0
    while True:
        prefix = string.ascii_lowercase[i]
        i = (i + 1) % num  # increment modulo number of files we want
        yield f"{prefix}.txt"

def col_name_generator(num):
    i = 0
    while True:
        col_suffix = string.ascii_lowercase[i]
        i = (i + 1) % num  # increment modulo number of files we want
        yield COL_NAMES_TO_WRITE.format(letter=col_suffix).split(',')

def main(filename, num_files=4):
    """Split a file into multiple files

    Args:
        filename (str): large filename that needs to be split into multiple files
        num_files (int): number of files to split filename into
    """
    with open(filename, 'r') as large_file_fp:
        reader = csv.DictReader(large_file_fp)
        output_files = output_file_generator(num_files)
        col_names = col_name_generator(num_files)
        for line in reader:
            # each row is written to every output file, with that file's columns
            for _ in range(num_files):
                filename_for_this_file = next(output_files)
                column_names_for_this_file = next(col_names)
                with open(filename_for_this_file, 'a') as output_fp:
                    writer = csv.DictWriter(output_fp, fieldnames=column_names_for_this_file)
                    if filename_for_this_file not in WRITTEN_HEADERS:
                        writer.writeheader()
                        WRITTEN_HEADERS.add(filename_for_this_file)
                    just_these_fields = {k: v for k, v in line.items() if k in column_names_for_this_file}
                    writer.writerow(just_these_fields)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-fn", "--filename", required=True, default='large_file.txt', help="filename of large file to be split")
    parser.add_argument("-n", "--num_files", required=False, default=4, help="number of separate files to split large_file into")
    args = parser.parse_args()
    main(args.filename, int(args.num_files))
import pandas as pd
import numpy as np

c = "A,B,C,D,F,G,H,GT_a,N_a_,E_a,GT_b,N_b_,E_b,GT_c,N_c_,E_c,GT_d,N_d_,E_d,GT_e,N_e_,E_e".split(',')
df = pd.DataFrame(np.full((30, 22), c), columns=c)

c = list(df.columns)
default = c[:7]
var = np.matrix(c[7:])
var = pd.DataFrame(var.reshape(var.shape[1] // 3, 3))

def dump(row):
    cols = default + list(row)
    magic = cols[-1][-1]  # the id letter, e.g. 'a' from 'E_a'
    df[cols].to_csv(magic + '.txt')

var.apply(dump, axis=1)

Renaming Multiple Files at Once in a Directory

I am attempting to take a file name such as 'OP 40 856101.txt' from a directory, remove the .txt, assign each word to a specific variable, then reorder the words into a required order such as '856101 OP 040'. Below is my code:
import os

dir = 'C:/Users/brian/Documents/Moeller'
orig = os.listdir(dir)  # original names of the files in the folder
for orig_name in orig:  # split each file name into a list of strings containing each word
    f = os.path.splitext(orig_name)[0]
    sep = f.split()  # separation is done by a space
    for t in sep:  # loop across the strings and save each part to a specific variable
        #print(t)
        if t.isalpha() and len(t) == 3:
            wc = t
        elif len(t) > 3 and len(t) < 6:
            wc = t
        elif t == 'OP':
            op = t
        elif len(t) >= 4:
            pnum = t
        else:
            opnum = t
    if len(opnum) == 2:
        opnum = '0' + opnum
    new_nam = '%s %s %s %s' % (pnum, op, opnum, wc)  # the text for the new name
    print("The orig filename is %r, the new filename is %r" % (orig_name, new_nam))
    os.rename(orig_name, new_nam)
However, I am getting an error when I attempt to rename each file in the directory:
FileNotFoundError: [WinError 2] The system cannot find the file specified: '150 856101 OP CLEAN.txt' -> '856101 OP 150 CLEAN'
The code runs perfectly until the os.rename() command; if I print out the variable new_nam, it prints the correct naming order for all of the files in the directory. It seems it cannot find the original file whose name should be replaced with the string in new_nam. I assume it is a directory issue; however, I am newer to Python and can't seem to figure out where to edit my code. Any tips or advice would be greatly appreciated!
Try this (just changed the last line):
os.rename(os.path.join(dir,orig_name), os.path.join(dir,new_nam))
You need to tell Python the actual path of the file to rename - otherwise, it looks only in the current working directory, not in the directory containing the files.
Incidentally, it's better not to use dir as a variable name, because that's the name of a built-in.
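For illustration, a small sketch of that fix which also keeps the original extension (the question's new_nam drops the .txt; the helper name is hypothetical):

import os

def renamed_path(folder, orig_name, new_base):
    # hypothetical helper: build the destination path inside the same folder,
    # re-attaching the extension that new_base lacks
    ext = os.path.splitext(orig_name)[1]  # e.g. '.txt'
    return os.path.join(folder, new_base + ext)

# inside the loop from the question, the last line would become:
# os.rename(os.path.join(dir, orig_name), renamed_path(dir, orig_name, new_nam))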

How to create an index to parse a big text file

I have two files A and B in FASTQ format, which are basically several hundred million lines of text organized in groups of 4 lines starting with an @ as follows:
@120412_SN549_0058_BD0UMKACXX:5:1101:1156:2031#0/1
GCCAATGGCATGGTTTCATGGATGTTAGCAGAAGACATGAGACTTCTGGGACAGGAGCAAAACACTTCATGATGGCAAAAGATCGGAAGAGCACACGTCTGAACTCN
+120412_SN549_0058_BD0UMKACXX:5:1101:1156:2031#0/1
bbbeee_[_ccdccegeeghhiiehghifhfhhhiiihhfhghigbeffeefddd]aegggdffhfhhihbghhdfffgdb^beeabcccabbcb`ccacacbbccB
I need to compare the
5:1101:1156:2031#0/
part between files A and B and write the matching groups of 4 lines from file B to a new file. I have a piece of Python code that does this, but it only works for small files, as it scans through all the @-lines of file B for every @-line in file A, and both files contain hundreds of millions of lines.
Someone suggested that I should create an index for file B; I have googled around without success and would be very grateful if someone could point out how to do this or let me know of a tutorial so I can learn. Thanks.
==EDIT==
In theory each group of 4 lines should only exist once in each file. Would it increase the speed enough to break out of the parsing after each match, or do I need a different algorithm altogether?
An index is just a shortened version of the information you are working with. In this case, you will want the "key" - the text between the first colon (':') on the @-line and the final slash ('/') near the end - as well as some kind of value.
Since the "value" in this case is the entire contents of the 4-line block, and since our index is going to store a separate entry for each block, we would be storing the entire file in memory if we used the actual value in the index.
Instead, let's use the file position of the beginning of the 4-line block. That way, you can move to that file position, print 4 lines, and stop. Total cost is the 4 or 8 or however many bytes it takes to store an integer file position, instead of however-many bytes of actual genome data.
Here is some code that does the job, but also does a lot of validation and checking. You might want to throw stuff away that you don't use.
import sys

def build_index(path):
    index = {}
    for key, pos, data in parse_fastq(path):
        if key not in index:
            # Don't overwrite duplicates - use first occurrence.
            index[key] = pos
    return index

def error(s):
    sys.stderr.write(s + "\n")

def extract_key(s):
    # This much is fairly constant:
    assert(s.startswith('@'))
    (machine_name, rest) = s.split(':', 1)
    # Per wikipedia, this changes in different variants of FASTQ format:
    (key, rest) = rest.split('/', 1)
    return key

def parse_fastq(path):
    """
    Parse the 4-line FASTQ groups in path.
    Validate the contents, somewhat.
    """
    f = open(path)
    i = 0
    # Note: iterating a file is incompatible with fh.tell(). Fake it.
    pos = offset = 0
    for line in f:
        offset += len(line)
        lx = i % 4
        i += 1
        if lx == 0:  # @machine: key
            key = extract_key(line)
            len1 = len2 = 0
            data = [line]
        elif lx == 1:
            data.append(line)
            len1 = len(line)
        elif lx == 2:  # +machine: key or something
            assert(line.startswith('+'))
            data.append(line)
        else:  # lx == 3: quality data
            data.append(line)
            len2 = len(line)
            if len2 != len1:
                error("Data length mismatch at line "
                      + str(i - 2)
                      + " (len: " + str(len1) + ") and line "
                      + str(i)
                      + " (len: " + str(len2) + ")\n")
            #print "Yielding #%i: %s" % (pos, key)
            yield key, pos, data
            pos = offset
    if i % 4 != 0:
        error("EOF encountered in mid-record at line " + str(i))

def match_records(path, index):
    results = []
    for key, pos, d in parse_fastq(path):
        if key in index:
            # found a match!
            results.append(key)
    return results

def write_matches(inpath, matches, outpath):
    rf = open(inpath)
    wf = open(outpath, 'w')
    for m in matches:
        rf.seek(m)
        wf.write(rf.readline())
        wf.write(rf.readline())
        wf.write(rf.readline())
        wf.write(rf.readline())
    rf.close()
    wf.close()

#import pdb; pdb.set_trace()
index = build_index('afile.fastq')
matches = match_records('bfile.fastq', index)
posns = [index[k] for k in matches]
write_matches('afile.fastq', posns, 'outfile.fastq')
Note that this code goes back to the first file to get the blocks of data. If your data is identical between files, you would be able to copy the block from the second file when a match occurs.
Note also that depending on what you are trying to extract, you may want to change the order of the output blocks, and you may want to make sure that the keys are unique, or perhaps make sure the keys are not unique but are repeated in the order they match. That's up to you - I'm not sure what you're doing with the data.
These guys claim to parse a file of a few gigabytes using a dedicated library (Biopython's SeqIO); see http://www.biostars.org/p/15113/
from Bio import SeqIO

fastq_parser = SeqIO.parse(fastq_filename, "fastq")
wanted = (rec for rec in fastq_parser if ...)
SeqIO.write(wanted, output_file, "fastq")
A better approach IMO would be to parse it once and load the data into some database instead of that output_file (e.g. MySQL), and later run the queries there.
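As a minimal sketch of that idea, here is what indexing into the standard library's sqlite3 (instead of MySQL) might look like; the file names and key layout follow the question, everything else is an assumption:

import sqlite3

conn = sqlite3.connect('fastq_index.db')
conn.execute("CREATE TABLE IF NOT EXISTS records (key TEXT PRIMARY KEY, pos INTEGER)")

# index file B once: store the key and byte offset of each 4-line record
with open('bfile.fastq', 'rb') as f:
    pos = f.tell()
    while True:
        lines = [f.readline() for _ in range(4)]
        if not lines[0]:
            break
        # key: text between the first ':' and the final '/' of the header line
        key = lines[0].split(b':', 1)[1].rsplit(b'/', 1)[0]
        conn.execute("INSERT OR IGNORE INTO records VALUES (?, ?)", (key.decode(), pos))
        pos = f.tell()
conn.commit()

# later, possibly from another script: look up a key and pull its record
row = conn.execute("SELECT pos FROM records WHERE key = ?",
                   ("5:1101:1156:2031#0",)).fetchone()
if row:
    with open('bfile.fastq', 'rb') as f:
        f.seek(row[0])
        print(b''.join(f.readline() for _ in range(4)).decode())

The index lives on disk, so it survives between runs and memory usage stays flat no matter how big file B is.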

Should I be comparing bytes using struct?

I'm trying to compare the data within two files and retrieve a list of offsets where the differences are.
I tried it on some text files and it worked quite well.
However on non-text files (that still contain ASCII text), which I call binary data files (executables, and so on), it seems to think some bytes are the same even though, when I look at them in a hex editor, they are obviously not. I tried printing out the binary data that it thinks is the same and I get blank lines where it should be printed.
Thus, I think this is the source of the problem.
So what is the best way to compare bytes of data that could be both binary and contain ASCII text? I thought using the struct module might be a starting point...
As you can see below, I compare the bytes with the == operator
Here's the code:
import os
import math

#file1 = 'file1.txt'
#file2 = 'file2.txt'
file1 = 'file1.exe'
file2 = 'file2.exe'

file1size = os.path.getsize(file1)
file2size = os.path.getsize(file2)
a = file1size - file2size
end = file1size  # if they are both same size
if a > 0:
    # file 2 is smallest
    end = file2size
    big = file1size
elif a < 0:
    # file 1 is smallest
    end = file1size
    big = file2size

f1 = open(file1, 'rb')
f2 = open(file2, 'rb')

readSize = 500
r = readSize
off = 0
data = []
looking = False
d = open('data.txt', 'w')

while off < end:
    f1.seek(off)
    f2.seek(off)
    b1, b2 = f1.read(r), f2.read(r)
    same = b1 == b2
    print ''
    if same:
        print 'Same at: ' + str(off)
        print 'readSize: ' + str(r)
        print b1
        print b2
        print ''
        # save offsets of the section of "different" bytes
        #data.append([diffOff, diffOff+off-1]) # [begin diff off, end diff off]
        if looking:
            d.write(str(diffOff) + " => " + str(diffOff + off - 2) + "\n")
            looking = False
            r = readSize
            off = off + 1
        else:
            off = off + r
    else:
        if r == 1:
            looking = True
            diffOff = off
            off = off + 1  # continue reading 1 at a time, until u find a same reading
        r = 1  # it will shoot back to the last off, since we didn't increment it here

d.close()
f1.close()
f2.close()

# add the diff ending portion to diff data offs, if 1 file is longer than the other
a = int(math.fabs(a))  # get abs val of diff
if a:
    data.append([big - a, big - 1])
print data
Did you try the difflib and filecmp modules?

This module provides classes and functions for comparing sequences. It can be used, for example, for comparing files, and can produce difference information in various formats, including HTML and context and unified diffs. For comparing directories and files, see also the filecmp module.

The filecmp module defines functions to compare files and directories, with various optional time/correctness trade-offs. For comparing files, see also the difflib module.
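For instance, a minimal sketch of both modules on binary files (the file names are placeholders; SequenceMatcher is fine for smallish files but slow on very large ones):

import filecmp
import difflib

# quick equality check, comparing contents rather than os.stat() info
print(filecmp.cmp('file1.exe', 'file2.exe', shallow=False))

# byte-level diff opcodes, with offsets into each file
b1 = open('file1.exe', 'rb').read()
b2 = open('file2.exe', 'rb').read()
for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(None, b1, b2).get_opcodes():
    if tag != 'equal':
        print(tag, i1, i2, j1, j2)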
You are probably encountering encoding/decoding problems. Someone may suggest a better solution, but you could try reading the file into a bytearray so you're reading raw bytes instead of decoded characters:
Here's a crude example:
$ od -Ax -tx1 /tmp/aa
000000 e0 b2 aa 0a
$ od -Ax -tx1 /tmp/bb
000000 e0 b2 bb 0a
$ cat /tmp/diff.py
a = bytearray(open('/tmp/aa', 'rb').read())
b = bytearray(open('/tmp/bb', 'rb').read())
print "%02x, %02x" % (a[2], a[3])
print "%02x, %02x" % (b[2], b[3])
$ python /tmp/diff.py
aa, 0a
bb, 0a
