Read between 2 offsets of a file - python

I was wondering how the read() function can be used to read between two offsets of a file when the offsets are given in hex.
I tried using the code below to convert the offset values to int, but I get a syntax error on the read() line. Any ideas?
OFFSETS = ('3AF7','3ECF')
OFFSETE = ('3B04','3EDE')
for r, d, f in os.walk("."):
    for hahahoho, value in enumerate(OFFSETS and OFFSETE):
        try:
            with open(os.path.join(r,f), 'rb' ) as fileread:
                texttoprint = fileread.seek(int(OFFSETS[hahahoho], 16) -1)
                yeeha = texttoprint.read[int(OFFSETS[hahahoho], 16) -1 : int(OFFSETE[damn],16)]
                print (yeeha)
                hahahoho + 1
This is not the entire code, though; I just posted the part I need help with =(
EDIT:
Alright, I think I should listen to your advice. This is the entire code:
nost = 1
OFFSETS = ('3AF7','3ECF')
OFFSETE = ('3B04','3EDE')
endscript = 'No'
nooffile = 1
import os, glob, sys, tempfile
try:
    directory = input('Enter your directory:').replace('\\','\\\\')
    os.chdir(directory)
except FileNotFoundError:
    print ('Directory not found!')
    endscript = 'YES!'
if endscript == 'YES!':
    sys.exit('Error. Be careful of what you do to your computer!')
else:
    if os.path.isfile('Done.txt') == True:
        print ('The folder has already been modified!')
    else:
        print ('Searching texts...\r\n')
        print ('Printing...')
        for r, d, f in os.walk("."):
            for HODF in f:
                if HODF.endswith(".hod") or "." not in HODF:
                    for damn, value in enumerate(OFFSETS and OFFSETE):
                        try:
                            with open(os.path.join(r,HODF), 'rb' ) as fileread:
                                fileread.seek(int(OFFSETS[damn],16) -1)
                                yeeha = fileread.read(int(OFFSETE[damn], 16) - (int(OFFSETS[damn],16) -1))
                                if b'?\x03\x00\x00\x00\x01\x00\x00\x00Leg2.' not in yeeha and b'?\x03\x00\x00\x00\x01\x00\x00\x00Leg2_r.' not in yeeha:
                                    print (yeeha)
                                damn + 1
                        except FileNotFoundError:
                            print('Invalid file path!')
                            os._exit(1)
                        except IndexError:
                            print ('File successfully modified!')
                    nooffile = nooffile + 1
                    nost = 1
        print ('\r\n'+str(nooffile)+' files read.',)
        print ('\tANI_list.txt, End.dat, Group.txt, Head.txt, Tail.dat files ignored.')
        print ('\r\nFiles successfully read! Hope you found what you are looking for!')
May I know what's wrong with it? Because it works just fine for me.

There are other problems with your code, but it sounds like you want to solve that yourself. When it comes to reading a particular byte range from a file, you can do that like this:
start = 1000
end = 1020 # Just examples
fileread.seek(start)
stuff = fileread.read(end - start)
That is, you start by seeking to the start position, and then you read as many bytes as you need (that is 20, in this example).
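Tying that back to the hex-string offsets in the question, a minimal sketch might look like this (the helper name and the inclusive-end convention are just illustrative assumptions, not part of your code):
def read_range(path, start_hex, end_hex):
    # convert the hex strings (e.g. '3AF7', '3B04') to integers
    start = int(start_hex, 16)
    end = int(end_hex, 16)
    with open(path, 'rb') as f:
        f.seek(start)
        return f.read(end - start + 1)   # bytes from start through end, inclusive

# e.g. read_range('some.hod', '3AF7', '3B04')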
EDIT:
The only real "problem" with your code is that you're using enumerate in a strange and weird fashion that makes it completely unnecessary. The expression OFFSETS and OFFSETE will simply evaluate to OFFSETE, making OFFSETS and completely superfluous in it. Then, you're only actually using the first value from enumerate (the index), which makes enumerate itself superfluous: You could just have used range(len(OFFSETE)) instead.
More proper, however, would be to loop directly over the values instead of going via an index, like this:
for start, end in zip(OFFSETS, OFFSETE):
    # snip
    fileread.seek(int(start, 16) - 1)
    yeeha = fileread.read(int(end, 16) - int(start, 16) + 1)
The other things are more like slight uglinesses that could be eliminated to make your code much nicer, but aren't strictly speaking wrong. Among them are that you don't need to represent your offsets as strings, but could use hexadecimal literals instead; that you open the file multiple times for no reason; that the hahahoho + 1 expression is completely superfluous; and that you could just bake the - 1 extra offsets directly into your actual offsets instead of adding it later.
I would write it closer to this instead:
OFFSETS = [0x3AF7 - 1, 0x3ECF - 2]
OFFSETE = [0x3B04 - 1, 0x3EDE - 2]
for r, d, f in os.walk("."):
    for fn in f:
        with open(os.path.join(r, fn), "rb") as fp:
            for start, end in zip(OFFSETS, OFFSETE):
                fp.seek(start)
                yeeha = fp.read(end - start)
                # Do whatever it is you need with yeeha

Related

Python text file manipulation, add delta time to each line in seconds

I am a beginner at Python and am trying to solve the following:
I have a text file in which each line starts like this:
<18:12:53.972>
<18:12:53.975>
<18:12:53.975>
<18:12:53.975>
<18:12:54.008>
etc
Instead of the above, I would like to add the elapsed time in seconds at the beginning of each line, but only if the line starts with '<'.
<0.0><18:12:53.972>
<0.003><18:12:53.975>
<0.003><18:12:53.975>
<0.003><18:12:53.975>
<0.036><18:12:54.008>
etc
Here comes a try :-)
#import datetime
from datetime import timedelta
from sys import argv
#get filename as argument
run, input, output = argv
#get number of lines for textfile
nr_of_lines = sum(1 for line in open(input))
#read in file
f = open(input)
lines = f.readlines()
f.close
#declarations
do_once = True
time = []
delta_to_list = []
i = 0
#read in and translate all timevalues from logfile to delta time.
while i < nr_of_lines:
    i += 1
    if lines[i-1].startswith('<'):
        get_lines = lines[i-1] #get one line
        get_time = (get_lines[1:13]) #get the time from that line
        h = int(get_time[0:2])
        m = int(get_time[3:5])
        s = int(get_time[6:8])
        ms = int(get_time[9:13])
        time = timedelta(hours = h, minutes = m, seconds = s, microseconds = 0, milliseconds = ms)
        sec_time = time.seconds + (ms/1000)
        if do_once:
            start_value = sec_time
            do_once = False
        delta = float("{0:.3f}".format(sec_time - start_value))
        delta_to_list.append(delta)
#write back values to logfile.
k=0
s = str(delta_to_list[k])
with open(output, 'w') as out_file:
    with open(input, 'r') as in_file:
        for line in in_file:
            if line.startswith('<'):
                s = str(delta_to_list[k])
                out_file.write("<" + s + ">" + line)
            else:
                out_file.write(line)
            k += 1
As it is now, it works fine, but the last two lines are not written to the new file. It says: "s = str(delta_to_list[k]) IndexError: list index out of range".
At first I would like to get my code working, and second a suggestions for improvements. Thank you!
First point: never read a full file into memory when you don't have to (and especially when you don't know whether you have enough free memory).
Second point: learn to use Python's for loop and iteration protocol. The way to iterate over a list or any other iterable is:
for item in some_iterable:
do_something_with(item)
This avoids messing with indexes and getting it wrong ;)
One of the nice things with Python file objects is that they actually are iterables, so to iterate over a file's lines, the simplest way is:
for line in my_opened_file:
do_something_with(line)
Here's a simple yet working and mostly pythonic (nb: python 2.7.x) way to write your program:
# -*- coding: utf-8 -*-
import os
import sys
import datetime
import re
import tempfile


def totime(timestr):
    """ returns a datetime object for a "HH:MM:SS" string """
    # we actually need datetime objects for subtraction
    # so let's use the first available bogus date
    # notes:
    # `timestr.split(":")` will return a list `["HH", "MM", "SS"]`
    # `map(int, ...)` will apply `int()` on each item
    # of the sequence (second argument) and return
    # the resulting list, ie
    # `map(int, ["01", "02", "03"])` => `[1, 2, 3]`
    return datetime.datetime(1900, 1, 1, *map(int, timestr.split(":")))


def process(instream, outstream):
    # some may consider that regexps are not that pythonic
    # but as far as I'm concerned it seems like a sensible
    # use case.
    time_re = re.compile(r"^<(?P<time>\d{2}:\d{2}:\d{2})\.")
    first = None
    # iterate over our input stream lines
    for line in instream:
        # should we handle this line at all ?
        # (nb a bit redundant but faster than re.match)
        if not line.startswith("<"):
            continue
        # looks like a candidate, let's try and
        # extract the 'time' value from it
        match = time_re.search(line)
        if not match:
            # starts with '<' BUT not followed by 'HH:MM:SS.' ?
            # unexpected from the sample source but well, we
            # can't do much about it either
            continue
        # retrieve the captured "time" (HH:MM:SS) part
        current = totime(match.group("time"))
        # store the first occurrence so we can
        # compute the elapsed time
        if first is None:
            first = current
        # `(current - first)` yields a `timedelta` object
        # we now just have to retrieve its `seconds` attribute
        seconds = (current - first).seconds
        # inject the seconds before the line
        # and write the whole thing to our output stream
        newline = "{}{}".format(seconds, line)
        outstream.write(newline)


def usage(err=None):
    if err:
        print >> sys.stderr, err
    print >> sys.stderr, "usage: python retime.py <filename>"
    # unix standard process exit codes
    return 2 if err else 0


def main(*args):
    # our entry point...
    # gets the source filename, processes it
    # (storing the results in a temporary file),
    # and if everything's ok replaces the source file
    # by the temporary file.
    try:
        sourcename = args[0]
    except IndexError as e:
        return usage("missing <filename> argument")
    # `delete=False` prevents the tmp file from being
    # deleted on closing.
    dest = tempfile.NamedTemporaryFile(delete=False)
    with open(sourcename) as source:
        try:
            process(source, dest)
        except Exception as e:
            dest.close()
            os.remove(dest.name)
            raise
    # ok done
    dest.close()
    os.rename(dest.name, sourcename)
    return 0


if __name__ == "__main__":
    # only execute main() if we are called as a script
    # (so we can also import this file as a module)
    sys.exit(main(*sys.argv[1:]))
It gives the expected results on your sample data (running on linux - but it should be ok on any other supported OS afaict).
Note that I wrote it to work like your original code (replace the source file with the processed one), but if it were my code I would instead either explicitly provide a destination filename or, as a default, write to sys.stdout instead (and redirect stdout to another file). The process function can deal with any of those solutions FWIW - it's only a matter of a couple of edits in main().
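For what it's worth, a stdout-writing variant of main() could look roughly like this (just a sketch reusing the usage() and process() functions above; redirect stdout to capture the result):
def main(*args):
    try:
        sourcename = args[0]
    except IndexError:
        return usage("missing <filename> argument")
    # e.g.  python retime.py source.log > retimed.log
    with open(sourcename) as source:
        process(source, sys.stdout)
    return 0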

how to create an index to parse big text file

I have two files A and B in FASTQ format, which are basically several hundred million lines of text organized in groups of 4 lines starting with an # as follows:
#120412_SN549_0058_BD0UMKACXX:5:1101:1156:2031#0/1
GCCAATGGCATGGTTTCATGGATGTTAGCAGAAGACATGAGACTTCTGGGACAGGAGCAAAACACTTCATGATGGCAAAAGATCGGAAGAGCACACGTCTGAACTCN
+120412_SN549_0058_BD0UMKACXX:5:1101:1156:2031#0/1
bbbeee_[_ccdccegeeghhiiehghifhfhhhiiihhfhghigbeffeefddd]aegggdffhfhhihbghhdfffgdb^beeabcccabbcb`ccacacbbccB
I need to compare the
5:1101:1156:2031#0/
part between files A and B and write the groups of 4 lines in file B that matched to a new file. I got a piece of code in Python that does that, but it only works for small files, as it parses through all the #-lines of file B for every #-line in file A, and both files contain hundreds of millions of lines.
Someone suggested that I should create an index for file B; I have googled around without success and would be very grateful if someone could point out how to do this or let me know of a tutorial so I can learn. Thanks.
==EDIT==
In theory each group of 4 lines should only exist once in each file. Would it increase the speed enough to break out of the parsing after each match, or do I need a different algorithm altogether?
An index is just a shortened version of the information you are working with. In this case, you will want the "key" - the text between the first colon(':') on the #-line and the final slash('/') near the end - as well as some kind of value.
Since the "value" in this case is the entire contents of the 4-line block, and since our index is going to store a separate entry for each block, we would be storing the entire file in memory if we used the actual value in the index.
Instead, let's use the file position of the beginning of the 4-line block. That way, you can move to that file position, print 4 lines, and stop. Total cost is the 4 or 8 or however many bytes it takes to store an integer file position, instead of however-many bytes of actual genome data.
Here is some code that does the job, but also does a lot of validation and checking. You might want to throw stuff away that you don't use.
import sys

def build_index(path):
    index = {}
    for key, pos, data in parse_fastq(path):
        if key not in index:
            # Don't overwrite duplicates- use first occurrence.
            index[key] = pos
    return index

def error(s):
    sys.stderr.write(s + "\n")

def extract_key(s):
    # This much is fairly constant:
    assert(s.startswith('#'))
    (machine_name, rest) = s.split(':', 1)
    # Per wikipedia, this changes in different variants of FASTQ format:
    (key, rest) = rest.split('/', 1)
    return key

def parse_fastq(path):
    """
    Parse the 4-line FASTQ groups in path.
    Validate the contents, somewhat.
    """
    f = open(path)
    i = 0
    # Note: iterating a file is incompatible with fh.tell(). Fake it.
    pos = offset = 0
    for line in f:
        offset += len(line)
        lx = i % 4
        i += 1
        if lx == 0:     # #machine: key
            key = extract_key(line)
            len1 = len2 = 0
            data = [ line ]
        elif lx == 1:
            data.append(line)
            len1 = len(line)
        elif lx == 2:   # +machine: key or something
            assert(line.startswith('+'))
            data.append(line)
        else:           # lx == 3 : quality data
            data.append(line)
            len2 = len(line)
            if len2 != len1:
                error("Data length mismatch at line "
                      + str(i-2)
                      + " (len: " + str(len1) + ") and line "
                      + str(i)
                      + " (len: " + str(len2) + ")\n")
            #print "Yielding #%i: %s" % (pos, key)
            yield key, pos, data
            pos = offset
    if i % 4 != 0:
        error("EOF encountered in mid-record at line " + str(i));

def match_records(path, index):
    results = []
    for key, pos, d in parse_fastq(path):
        if key in index:
            # found a match!
            results.append(key)
    return results

def write_matches(inpath, matches, outpath):
    rf = open(inpath)
    wf = open(outpath, 'w')
    for m in matches:
        rf.seek(m)
        wf.write(rf.readline())
        wf.write(rf.readline())
        wf.write(rf.readline())
        wf.write(rf.readline())
    rf.close()
    wf.close()

#import pdb; pdb.set_trace()
index = build_index('afile.fastq')
matches = match_records('bfile.fastq', index)
posns = [ index[k] for k in matches ]
write_matches('afile.fastq', posns, 'outfile.fastq')
Note that this code goes back to the first file to get the blocks of data. If your data is identical between files, you would be able to copy the block from the second file when a match occurs.
Note also that depending on what you are trying to extract, you may want to change the order of the output blocks, and you may want to make sure that the keys are unique, or perhaps make sure the keys are not unique but are repeated in the order they match. That's up to you - I'm not sure what you're doing with the data.
These guys claim to parse files of a few gigabytes using a dedicated library; see http://www.biostars.org/p/15113/
from Bio import SeqIO   # Biopython

fastq_parser = SeqIO.parse(fastq_filename, "fastq")
wanted = (rec for rec in fastq_parser if ...)
SeqIO.write(wanted, output_file, "fastq")
A better approach IMO would be to parse it once and load the data into some database (e.g. MySQL) instead of that output_file, and later run the queries there.
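For illustration, here is a rough sketch of that idea using sqlite3 from the standard library instead of MySQL, and assuming the parse_fastq() generator from the answer above (which yields (key, pos, data) tuples):
import sqlite3

conn = sqlite3.connect('fastq_index.db')
conn.execute('CREATE TABLE IF NOT EXISTS reads (read_key TEXT PRIMARY KEY, pos INTEGER)')

# store each read's key and file offset once
conn.executemany('INSERT OR IGNORE INTO reads VALUES (?, ?)',
                 ((key, pos) for key, pos, data in parse_fastq('bfile.fastq')))
conn.commit()

# later, a lookup is a single indexed query instead of a full file scan
row = conn.execute('SELECT pos FROM reads WHERE read_key = ?',
                   ('5:1101:1156:2031#0',)).fetchone()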

Python Memory error during function return statement

Hi, I am processing a 600 MB file. I have written the code below. What I am doing is searching for a keyword in the data between <dest> tags and, if it exists, adding a city tag to the <dest> tag. It worked fine for a small set of data, but when I ran the program on the large file it threw a MemoryError. I guess I am getting this error when I use the return statement in the if condition; can anyone please let me know how to solve this?
import re

def casp ( tx ):
    def tbcnv( st ):
        ct = ''
        prt = re.compile(r"(?i)(Slip Copy,.*?\))", re.DOTALL|re.M)
        val = re.search(prt, st)
        try:
            ct = val.group(1)
            if re.search(r"(?i)alaska", ct):
                jval = "Alaska"
                print jval
                if jval:
                    prt = re.compile(r"(?i)(.*?<dest.*?>)", re.DOTALL|re.M)
                    vl = re.sub(prt, "\\1\n" + "<city>" + jval + "</city>" + "\n" ,st)
                    return vl
                else:
                    return st
            else:
                return st
        except:
            print "Not available"
            return st
    pt = re.compile("(?i)(<dest.*?</dest>)", re.DOTALL|re.M)
    t = re.sub(pt, lambda m: tbcnv(m.group(1)), tx)
    return t

with open('input.txt', 'r') as content_file:
    content = content_file.read()

pt = re.compile(r"(?i)<Lrlevel level='3'>(.*?)</Lrlevel>", re.DOTALL|re.M)
content = re.sub(pt,lambda m: "<Lrlevel level='3'>" + casp(m.group(1) + "</Lrlevel>" ), content)

with open('out.txt', 'w') as out_file:
    out_file.write(content)
If you remove the return statement just before the except, then the string built by re.sub() is much smaller.
I'm getting memory usage that is 3 times the file size, which means that you'd get a MemoryError if you don't have (more than) 2GB. This is reasonable here --- or at least I can guess why. It's how re.sub() works.
This means that you're using somehow the wrong tools, as explained in the comments above. You should either use a full xml-processing tool like lxml, or if you want to stick with regular expressions, find a way to never need the whole string in memory; or at least to never call re.sub() on it (e.g. only the tx variable ever contains a big string, which is the input; and you do pt.search(tx, startpos) in a loop, locating the places to change, and writing piece by piece parts of tx).
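For example, the piece-by-piece idea could be sketched like this (it reuses the poster's casp() function and the <Lrlevel> pattern from the question; treat it as an outline, not a drop-in fix):
import re

pt = re.compile(r"(?i)<Lrlevel level='3'>(.*?)</Lrlevel>", re.DOTALL)

with open('input.txt', 'r') as content_file:
    tx = content_file.read()              # the only big string kept in memory

with open('out.txt', 'w') as out_file:
    startpos = 0
    while True:
        m = pt.search(tx, startpos)
        if m is None:
            out_file.write(tx[startpos:])            # copy the unchanged tail
            break
        out_file.write(tx[startpos:m.start()])       # text before the match, unchanged
        out_file.write("<Lrlevel level='3'>" + casp(m.group(1)) + "</Lrlevel>")
        startpos = m.end()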

difflib python formatting

I am using this code to find the difference between two CSV lists and have some formatting questions. This is probably an easy fix, but I am new and trying to learn, and I'm having a lot of problems.
import difflib

diff=difflib.ndiff(open('test1.csv',"rb").readlines(), open('test2.csv',"rb").readlines())
try:
    while 1:
        print diff.next(),
except:
    pass
The code works fine and I get the output I am looking for:
Group,Symbol,Total
- Adam,apple,3850
? ^
+ Adam,apple,2850
? ^
bob,orange,-45
bob,lemon,66
bob,appl,-56
bob,,88
My question is: how do I clean up the formatting? Can I make Group,Symbol,Total into separate columns, and line up the text below?
Also, can I change the ? to represent text I determine, such as test 1 and test 2, indicating which sheet it comes from?
Thanks for any help.
Using difflib.unified_diff gives much cleaner output, see below.
Also, both difflib.ndiff and difflib.unified_diff return a generator, which you can use directly in a for loop and which knows when to stop, so you don't have to handle exceptions yourself. N.B.: the comma after print line is there to prevent print from adding another newline.
import difflib

s1 = ['Adam,apple,3850\n', 'bob,orange,-45\n', 'bob,lemon,66\n',
      'bob,appl,-56\n', 'bob,,88\n']
s2 = ['Adam,apple,2850\n', 'bob,orange,-45\n', 'bob,lemon,66\n',
      'bob,appl,-56\n', 'bob,,88\n']

for line in difflib.unified_diff(s1, s2, fromfile='test1.csv',
                                 tofile='test2.csv'):
    print line,
This gives:
--- test1.csv
+++ test2.csv
@@ -1,4 +1,4 @@
-Adam,apple,3850
+Adam,apple,2850
bob,orange,-45
bob,lemon,66
bob,appl,-56
So you can clearly see which lines were changed between test1.csv and test2.csv.
To line up the columns, you must use string formatting.
E.g. print "%-20s %-20s %-20s" % (row[0],row[1],row[2]).
To change the ? into any text you like, you'd use s.replace('?', 'any text I like').
Your problem has more to do with the CSV format, since difflib has no idea it's looking at columnar fields. What you need is to figure out into which field the guide is pointing, so that you can adjust it when printing the columns.
If your CSV files are simple, i.e. they don't contain any quoted fields with embedded commas or (shudder) newlines, you can just use split(',') to separate them into fields, and figure out where the guide points as follows:
def align(line, guideline):
    """
    Figure out which field the guide (^) points to, and the offset within it.
    E.g., if the guide points 3 chars into field 2, return (2, 3)
    """
    fields = line.split(',')
    guide = guideline.index('^')
    f = p = 0
    while p + len(fields[f]) < guide:
        p += len(fields[f]) + 1  # +1 for the comma
        f += 1
    offset = guide - p
    return f, offset
Now it's easy to show the guide properly. Let's say you want to align your columns by printing everything 12 spaces wide:
diff = difflib.ndiff(...)
for line in diff:
    code = line[0]  # The diff prefix
    print code,
    if code == '?':
        fld, offset = align(lastline, line[2:])
        for f in range(fld):
            print "%-12s" % '',
        print ' '*offset + '^'
    else:
        fields = line[2:].rstrip('\r\n').split(',')
        for f in fields:
            print "%-12s" % f,
        print
    lastline = line[2:]
Be warned that the only reliable way to parse CSV files is to use the csv module (or a robust alternative); but getting it to play well with the diff format (in full generality) would be a bit of a headache. If you're mainly interested in readability and your CSV isn't too gnarly, you can probably live with an occasional mix-up.
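If quoted fields do matter, you can still split one line at a time robustly by handing it to the csv module, for example:
import csv

line = 'Adam,"apple, red",3850\r\n'
fields = next(csv.reader([line]))
# fields == ['Adam', 'apple, red', '3850']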

Should I be comparing bytes using struct?

I'm trying to compare the data within two files, and retrieve a list of offsets of where the differences are.
I tried it on some text files and it worked quite well.
However, on non-text files that still contain ASCII text (I call them binary data files: executables and so on), it seems to think some bytes are the same, even though when I look at them in a hex editor they are obviously not. I tried printing out the binary data that it thinks is the same and I get blank lines where it should be printed.
Thus, I think this is the source of the problem.
So what is the best way to compare bytes of data that could be both binary and contain ASCII text? I thought using the struct module might be a starting point...
As you can see below, I compare the bytes with the == operator
Here's the code:
import os
import math

#file1 = 'file1.txt'
#file2 = 'file2.txt'
file1 = 'file1.exe'
file2 = 'file2.exe'

file1size = os.path.getsize(file1)
file2size = os.path.getsize(file2)

a = file1size - file2size
end = file1size #if they are both same size
if a > 0:
    #file 2 is smallest
    end = file2size
    big = file1size
elif a < 0:
    #file 1 is smallest
    end = file1size
    big = file2size

f1 = open(file1, 'rb')
f2 = open(file2, 'rb')

readSize = 500
r = readSize
off = 0
data = []
looking = False
d = open('data.txt', 'w')

while off < end:
    f1.seek(off)
    f2.seek(off)
    b1, b2 = f1.read(r), f2.read(r)
    same = b1 == b2
    print ''
    if same:
        print 'Same at: '+str(off)
        print 'readSize: '+str(r)
        print b1
        print b2
        print ''
        #save offsets of the section of "different" bytes
        #data.append([diffOff, diffOff+off-1]) #[begin diff off, end diff off]
        if looking:
            d.write(str(diffOff)+" => "+str(diffOff+off-2)+"\n")
            looking = False
            r = readSize
            off = off + 1
        else:
            off = off + r
    else:
        if r == 1:
            looking = True
            diffOff = off
            off = off + 1 #continue reading 1 at a time, until u find a same reading
        r = 1 #it will shoot back to the last off, since we didn't increment it here

d.close()
f1.close()
f2.close()

#add the diff ending portion to diff data offs, if 1 file is longer than the other
a = int(math.fabs(a)) #get abs val of diff
if a:
    data.append([big-a, big-1])
print data
Did you try difflib and filecmp modules?
difflib: This module provides classes and functions for comparing sequences. It can be used, for example, for comparing files, and can produce difference information in various formats, including HTML and context and unified diffs. For comparing directories and files, see also the filecmp module.
filecmp: The filecmp module defines functions to compare files and directories, with various optional time/correctness trade-offs. For comparing files, see also the difflib module.
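As a quick illustration (a sketch only - it tells you whether the files differ, not where): filecmp.cmp() with shallow=False does a full content comparison, which can serve as a cheap first pass before scanning for offsets:
import filecmp

# shallow=False compares file contents, not just os.stat() signatures
if not filecmp.cmp('file1.exe', 'file2.exe', shallow=False):
    print 'files differ - worth scanning for offsets'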
You are probably encountering encoding/decoding problems. Someone may suggest a better solution, but you could try reading the file into a bytearray so you're reading raw bytes instead of decoded characters:
Here's a crude example:
$ od -Ax -tx1 /tmp/aa
000000 e0 b2 aa 0a
$ od -Ax -tx1 /tmp/bb
000000 e0 b2 bb 0a
$ cat /tmp/diff.py
a = bytearray(open('/tmp/aa', 'rb').read())
b = bytearray(open('/tmp/bb', 'rb').read())
print "%02x, %02x" % (a[2], a[3])
print "%02x, %02x" % (b[2], b[3])
$ python /tmp/diff.py
aa, 0a
bb, 0a
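Building on the same idea, the offsets where the two byte sequences disagree can be collected straight from the bytearrays (a sketch; it only covers the length the files share, so a longer file's tail still needs separate handling):
a = bytearray(open('/tmp/aa', 'rb').read())
b = bytearray(open('/tmp/bb', 'rb').read())

# offsets where the two files disagree, over their common length
diff_offsets = [i for i, (x, y) in enumerate(zip(a, b)) if x != y]
print diff_offsets   # [2] for the sample files above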
