Read gz files and get the last 24 hours of lines in Python

I have three files: two .gz files and one .log file. These files are pretty big. Below is a sample of my original data. I want to extract the entries that correspond to the last 24 hours.
a.log.1.gz
2018/03/25-00:08:48.638553 508 7FF4A8F3D704 snononsonfvnosnovoosr
2018/03/25-10:08:48.985053 346K 7FE9D2D51706 ahelooa afoaona woom
2018/03/25-20:08:50.486601 1.5M 7FE9D3D41706 qojfcmqcacaeia
2018/03/25-24:08:50.980519 16K 7FE9BD1AF707 user: number is 93823004
2018/03/26-00:08:50.981908 1389 7FE9BDC2B707 user 7fb31ecfa700
2018/03/26-10:08:51.066967 0 7FE9BDC91700 Exit Status = 0x0
2018/03/26-15:08:51.066968 1 7FE9BDC91700 std:ZMD:
a.log.2.gz
2018/03/26-20:08:48.638553 508 7FF4A8F3D704 snononsonfvnosnovoosr
2018/03/26-24:08:48.985053 346K 7FE9D2D51706 ahelooa afoaona woom
2018/03/27-00:08:50.486601 1.5M 7FE9D3D41706 qojfcmqcacaeia
2018/03/27-10:08:50.980519 16K 7FE9BD1AF707 user: number is 93823004
2018/03/27-20:08:50.981908 1389 7FE9BDC2B707 user 7fb31ecfa700
2018/03/27-24:08:51.066967 0 7FE9BDC91700 Exit Status = 0x0
2018/03/28-00:08:51.066968 1 7FE9BDC91700 std:ZMD:
a.log
2018/03/28-10:08:48.638553 508 7FF4A8F3D704 snononsonfvnosnovoosr
2018/03/28-20:08:48.985053 346K 7FE9D2D51706 ahelooa afoaona woom
Desired Result
result.txt
2018/03/27-20:08:50.981908 1389 7FE9BDC2B707 user 7fb31ecfa700
2018/03/27-24:08:51.066967 0 7FE9BDC91700 Exit Status = 0x0
2018/03/28-00:08:51.066968 1 7FE9BDC91700 std:ZMD:
2018/03/28-10:08:48.638553 508 7FF4A8F3D704 snononsonfvnosnovoosr
2018/03/28-20:08:48.985053 346K 7FE9D2D51706 ahelooa afoaona woom
I am not sure how to get the entries that cover the last 24 hours, and I then want to run the function below on that last-24-hours slice of the data.
def _clean_logs(line):
    # noinspection SpellCheckingInspection
    lemmatizer = WordNetLemmatizer()
    clean_line = line.strip()
    clean_line = clean_line.lstrip('0123456789.- ')
    cleaned_log = " ".join(
        [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in nltk.word_tokenize(clean_line) if
         word not in Stopwords.ENGLISH_STOP_WORDS and 2 < len(word) <= 30 and not word.startswith('_')])
    cleaned_log = cleaned_log.replace('"', ' ')
    return cleaned_log

Something like this should work.
from datetime import datetime, timedelta
import glob
import gzip
from pathlib import Path
import shutil

def open_file(path):
    if Path(path).suffix == '.gz':
        return gzip.open(path, mode='rt', encoding='utf-8')
    else:
        return open(path, encoding='utf-8')

def parsed_entries(lines):
    for line in lines:
        yield line.split(' ', maxsplit=1)

def earlier():
    return (datetime.now() - timedelta(hours=24)).strftime('%Y/%m/%d-%H:%M:%S')

def get_files():
    return ['a.log'] + list(reversed(sorted(glob.glob('a.log.*'))))

output = open('output.log', 'w', encoding='utf-8')
files = get_files()
cutoff = earlier()

for i, path in enumerate(files):
    with open_file(path) as f:
        lines = parsed_entries(f)
        # Assumes that your files are not empty
        date, line = next(lines)
        if cutoff <= date:
            # Skip files that can just be appended to the output later
            continue
        for date, line in lines:
            if cutoff <= date:
                # We've reached the first entry of our file that should be
                # included; re-join the timestamp we split off above
                output.write(date + ' ' + line)
                break
        # Copies from the current position to the end of the file
        shutil.copyfileobj(f, output)
        break
else:
    # In case ALL the files are within the last 24 hours
    i = len(files)

for path in reversed(files[:i]):
    with open_file(path) as f:
        # Assumes that your files have trailing newlines.
        shutil.copyfileobj(f, output)

# Cleanup; it would get closed anyway when garbage collected or the process exits.
output.close()
Then if we make some test log files:
#!/bin/sh
echo "2019/01/15-00:00:00.000000 hi" > a.log.1
echo "2019/01/31-00:00:00.000000 hi2" > a.log.2
echo "2019/01/31-19:00:00.000000 hi3" > a.log
gzip a.log.1 a.log.2
and run our script, it outputs the expected result (for this point in time):
2019/01/31-00:00:00.000000 hi2
2019/01/31-19:00:00.000000 hi3
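To then feed the extracted window into the question's _clean_logs, a minimal sketch (an addition, assuming _clean_logs and its helpers such as get_wordnet_pos and the stop-word list are defined as in the question, and that the extraction above has produced output.log; the result file name is just a placeholder):

with open('output.log', encoding='utf-8') as extracted, \
        open('cleaned.txt', 'w', encoding='utf-8') as cleaned:
    for line in extracted:
        # run the question's cleaning function on each line of the 24-hour window
        cleaned.write(_clean_logs(line) + '\n')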

Working with log files often involves large amounts of data, so reading everything from the start every time is undesirable, since it wastes a lot of resources.
The fastest way to accomplish your goal that immediately came to my mind (better approaches certainly exist) is a very simple random search: we search through the logfile(s) in reverse order, thus beginning with the newest lines. Instead of visiting every line, you arbitrarily choose some stepsize and only look at a few lines per step. This way, you can search through gigabytes of data in a very short time.
Additionally, this approach does not require storing every line of a file in memory, only a few lines at a time plus the final result.
When a.log is the current log file, we begin searching here:
with open("a.log", "rb+") as fh:
Since we are only interested in the last 24 hours, we jump to the end first and save the timestamp to search for as a formatted string:
timestamp = datetime.datetime.now() - datetime.timedelta(days=1) # last 24h
# jump to logfile's end
fh.seek(0, 2) # <-- '2': search relative to file's end
index = fh.tell() # current position in file; here: logfile's *last* byte
Now we can begin our random search. Your lines appear to be about 65 characters long on average, hence we move by a multiple of that.
average_line_length = 65
stepsize = 1000
while True:
    # we move a step back (an absolute seek; 'index' already holds an
    # absolute position in the file)
    fh.seek(index - average_line_length * stepsize)
    # save our current position in the file
    index = fh.tell()
    # we try to read a "line" (multiply the avg. line length by a number
    # large enough to cover even long lines. Ignore the largest lines here,
    # since this is an edge case ruining our runtime. We rather skip
    # one iteration of the loop then)
    r = fh.read(average_line_length * 10)
    # our result now contains (on average) multiple lines, so we
    # split first
    lines = r.split(b"\n")
    # now we check for our time string
    found_older = False
    for l in lines:
        # your timestamps are formatted like '2018/03/28-20:08:48.985053'
        # I ignore minutes, seconds, ... here, just for the sake of simplicity
        timestr = l.split(b":")  # this gives us b'2018/03/28-20' in timestr[0]
        try:
            # next we convert this to a datetime
            found_time = datetime.datetime.strptime(timestr[0].decode(), "%Y/%m/%d-%H")
        except ValueError:
            # the chunk may start or end in the middle of a line; skip those pieces
            continue
        # finally, we check whether the found time is outside our 24-hour margin
        if found_time < timestamp:
            found_older = True
            break
    if found_older:
        # we have stepped past the 24-hour window, stop searching backwards
        break
With this code we will only end up inspecting a few lines per step (here: a stepsize of 1000 lines) as long as we are inside our last 24 hours. Once we have left the 24 hours, we know that we went at most stepsize * average_line_length bytes too far up the file.
Filtering out this "went too far" part then becomes very easy:
# go back to where the backward search stopped, then read the file's
# contents from that position to the end
fh.seek(index)
contents = fh.read()
# split into lines (dropping empty pieces, e.g. after the trailing newline)
lines_of_contents = [l for l in contents.split(b"\n") if l]

# helper function for removing all lines older than 24 hours
def check_line(line):
    # split to extract the date string
    tstr = line.split(b":")
    try:
        # convert it to a datetime
        ftime = datetime.datetime.strptime(tstr[0].decode(), "%Y/%m/%d-%H")
    except ValueError:
        # we may have started reading in the middle of a line; drop that piece
        return False
    return ftime > timestamp

# remove all lines that are older than 24 hours
final_result = filter(check_line, lines_of_contents)
Since contents covers all of the remaining contents of our file (and lines_of_contents all of its lines, which is simply contents split at the linebreaks \n), we can easily use filter to get our desired result.
Each line in lines_of_contents is fed to check_line, which returns True if the line's time is newer than timestamp, and timestamp is our datetime object describing exactly now minus one day. This means that check_line returns False for all lines older than timestamp, and filter removes those lines.
Obviously, this is far from optimal, but it is easy to understand and easily extendable to filtering on minutes, seconds, ...
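As an example of that extension, a hedged variant of check_line that parses the full timestamp down to microseconds (assuming the '2018/03/28-20:08:48.985053' layout from the question, whose fixed prefix is 26 characters):

def check_line(line):
    # parse the complete timestamp, including minutes, seconds and microseconds
    try:
        ftime = datetime.datetime.strptime(line[:26].decode(), "%Y/%m/%d-%H:%M:%S.%f")
    except ValueError:
        # partial or malformed piece (e.g. at a chunk boundary); drop it
        return False
    return ftime > timestamp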
Additionally, covering multiple files is also easy: you just need glob.glob to find all candidate files, start with the newest file and add another loop: you search through the files until our while loop breaks out for the first time, then break and read all remaining contents from the current file plus all contents from the (newer) files visited before.
Roughly, something like this:
final_lines = list()

for file in logfiles:
    # our while-loop from above
    while True:
        ...
    # if the while-loop did not break, all of the current logfile's content
    # is < 24 hours of age
    with open(file, "rb+") as fh:
        final_lines.extend(fh.readlines())
This way you simply store all lines of a logfile if every one of its lines is less than 24 hours old. If the loop does break at some point, i.e. we have found the logfile and the exact line that is more than 24 hours old, extend final_lines with final_result, since that covers only the lines newer than 24 hours.
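For completeness, a simplified, self-contained sketch of that multi-file loop. It deliberately drops the random-stepping optimisation and just filters each file in full, newest first, stopping at the first file that reaches back past the cut-off; the file names and the gzip handling are assumptions based on the question:

import datetime
import glob
import gzip

timestamp = datetime.datetime.now() - datetime.timedelta(days=1)

def line_is_recent(line):
    # parse the full '2018/03/28-20:08:48.985053' timestamp; drop partial lines
    try:
        t = datetime.datetime.strptime(line[:26].decode(), "%Y/%m/%d-%H:%M:%S.%f")
    except ValueError:
        return False
    return t > timestamp

def read_lines(path):
    # transparently handle both plain and gzipped logs
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rb") as fh:
        return [l for l in fh.read().split(b"\n") if l]

# newest first: the active log, then the rotated files in reverse name order
logfiles = ["a.log"] + sorted(glob.glob("a.log.*"), reverse=True)

final_lines = []
for path in logfiles:
    lines = read_lines(path)
    final_lines.extend(l for l in lines if line_is_recent(l))
    if not all(line_is_recent(l) for l in lines):
        # this file already reaches back past the 24-hour cut-off; older
        # files can only be older still, so stop here
        break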

Related

How to search if a string is in a very large file in Python

I have a text file of 100GB containing 100 billion digits of pi, and I need a fast way to search if a 21 digit number is anywhere in this file. Note that the whole file is a single line so no linebreak. I have this function that uses a large buffer (500mb) to load parts of the file and check if the number is there:
import os

def fnd(s):
    start = 2
    with open("pi_dec_1t_01.txt", 'r') as f:
        fsize = os.path.getsize("pi_dec_1t_01.txt")
        bsize = 536870912
        buffer = None
        if start > 0:
            f.seek(start)
        overlap = len(s) - 1
        while True:
            if (f.tell() >= overlap and f.tell() < fsize):
                f.seek(f.tell() - overlap)
            buffer = f.read(bsize)
            if buffer:
                pos = buffer.find(s)
                if pos >= 0:
                    return f.tell() - (len(buffer) - pos)
            else:
                return -1
It is fast if I want to search for only one of these numbers, but I need to search for up to 2 billion of them (until I find one), which would literally take centuries. Is there any time-efficient way to do this, even if I need to use some other language or platform?
You could examine this package and maybe find more info on the algorithm it implements: https://pyahocorasick.readthedocs.io/en/latest/
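For instance, a rough sketch of how pyahocorasick could be used to scan the file once for many candidate numbers at the same time (hedged: the chunk size, file handling and helper function are illustrative choices, not from the question):

import ahocorasick

def find_any(path, patterns, chunk_size=64 * 1024 * 1024):
    # build one automaton over all candidate numbers
    automaton = ahocorasick.Automaton()
    for p in patterns:
        automaton.add_word(p, p)          # the value returned on a match
    automaton.make_automaton()

    overlap = max(len(p) for p in patterns) - 1
    read_so_far = 0
    tail = ""
    with open(path, "r") as f:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                return None               # nothing found
            buf = tail + chunk
            buf_start = read_so_far - len(tail)
            for end, pattern in automaton.iter(buf):
                # absolute offset of the first digit of the match
                return buf_start + end - len(pattern) + 1, pattern
            read_so_far += len(chunk)
            tail = buf[-overlap:]         # keep an overlap so no match is split

find_any('pi_dec_1t_01.txt', candidate_numbers) would then return a (position, number) hit while reading the file only once, however many candidates there are.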

Iterating Over Two Large Lists using Python

I have two files, both of which are tab-delimited. One file is almost 800k lines and is an exonic-coordinates file, and the other is almost 200k lines (it is a VCF file).
I am writing code in Python to find and filter the positions in the VCF that lie within exonic coordinates (exon start and end from the exonic-coordinates file) and write them to a file.
However, because the files are big, it took a couple of days to get the filtered output file.
The code below partially solves the speed issue, but the remaining problem is speeding up the filtering itself, which is why I used a break to exit the second loop, and I want to start again from the beginning of the outer loop instead of just taking the next element of the outer loop.
Here is my code:
import csv
import os
import sys
list_coord = []
with open('ref_ordered.txt', 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter='\t')
    for row in reader:
        list_coord.append((row[0], row[1], row[2]))

def parseVcf(vcf, src):
    done = False
    with open(vcf, 'r') as f:
        reader = csv.reader((f), delimiter='\t')
        vcf_out_split = vcf.split('.')
        vcf_out_split.insert(2, "output_CORRECT2")
        outpt = open('.'.join(vcf_out_split), 'a')
        for coord in list_coord:
            for row in reader:
                if '#' not in row[0]:
                    coor_genom = int(row[1])
                    coor_exon1 = int(coord[1]) + 1
                    coor_exon2 = int(coord[2])
                    coor_genom_chr = row[0]
                    coor_exon_chr = coord[0]
                    ComH = row[7].split(';')
                    for x in ComH:
                        if 'DP4=' in x:
                            DP4_split = x[4:].split(',')
                            if (coor_exon1 <= coor_genom <= coor_exon2):
                                if (coor_genom_chr == coor_exon_chr):
                                    if ((int(DP4_split[2]) >= 1 and int(DP4_split[3]) >= 1)):
                                        done = True
                                        outpt.write('\t'.join(row) + '\n')
                if done:
                    break
        outpt.close()

for root, dirs, files in os.walk("."):
    for file in files:
        pathname = os.path.join(root, file)
        if file.find("1_1") == 0:
            print "Parsing " + file
            parseVcf(pathname, "1_1")
for root,dirs,files in os.walk("."):
for file in files:
pathname=os.path.join(root,file)
if file.find("1_1")==0:
print "Parsing " + file
parseVcf(pathname, "1_1")
ref_ordered.txt:
1 69090 70008
1 367658 368597
1 621095 622034
1 861321 861393
1 865534 865716
1 866418 866469
1 871151 871276
1 874419 874509
1_1 Input File:
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT directory
1 14907 rs79585140 A G 20 . DP=10;VDB=5.226464e-02;RPB=-6.206015e-01;AF1=0.5;AC1=1;DP4=1,2,5,2;MQ=32;FQ=20.5;PV4=0.5,0.07,0.16,0.33;DN=131;DA=A/G;GM=NR_024540.1;GL=WASH7P;FG=intron;FD=intron-variant;CP=0.001;CG=-0.312;CADD=1.415;AA=A;CN=dgv1e1,dgv2n71,dgv3e1,esv27265,nsv428112,nsv7879;DV=by-frequency,by-cluster;DSP=61 GT:PL:GQ 0/1:50,0,51:50
1 14930 rs75454623 A G 44 . DP=9;VDB=7.907652e-02;RPB=3.960091e-01;AF1=0.5;AC1=1;DP4=1,2,6,0;MQ=41;FQ=30.9;PV4=0.083,1,0.085,1;DN=131;DA=A/G;GM=NR_024540.1;GL=WASH7P;FG=intron;FD=intron-variant;CP=0.000;CG=-1.440;CADD=1.241;AA=A;CN=dgv1e1,dgv2n71,dgv3e1,esv27265,nsv428112,nsv7879;DV=by-frequency,by-cluster;DSP=38 GT:PL:GQ 0/1:74,0,58:61
1 15211 rs78601809 T G 9.33 . DP=6;VDB=9.014600e-02;RPB=-8.217058e-01;AF1=1;AC1=2;DP4=1,0,3,2;MQ=21;FQ=-37;PV4=1,0.35,1,1;DN=131;DA=T/G;GM=NR_024540.1;GL=WASH7P;FG=intron;FD=intron-variant;CP=0.001;CG=-0.145;CADD=1.611;AA=T;CN=dgv1e1,dgv2n71,dgv3e1,esv27265,nsv428112,nsv7879;DV=by-frequency,by-cluster;DSP=171 GT:PL:GQ 1/1:41,10,0:13
1 16146 . A C 25 . DP=10;VDB=2.063840e-02;RPB=-2.186229e+00;AF1=0.5;AC1=1;DP4=7,0,3,0;MQ=39;FQ=27.8;PV4=1,0.0029,1,0.0086;GM=NR_024540.1;GL=WASH7P;FG=intron;FD=unknown;CP=0.001;CG=-0.555;CADD=2.158;AA=A;CN=dgv1e1,dgv2n71,dgv3e1,esv27265,nsv428112,nsv7879;DSP=197 GT:PL:GQ 0/1:55,0,68:58
1 16257 rs78588380 G C 40 . DP=18;VDB=9.421102e-03;RPB=-1.327486e+00;AF1=0.5;AC1=1;DP4=3,11,4,0;MQ=50;FQ=43;PV4=0.011,1,1,1;DN=131;DA=G/C;GM=NR_024540.1;GL=WASH7P;FG=intron;FD=intron-variant;CP=0.001;CG=-2.500;CADD=0.359;AA=G;CN=dgv1e1,dgv2n71,dgv3e1,esv27265,nsv428112,nsv7879;DSP=308 GT:PL:GQ 0/1:70,0,249:73
1 16378 rs148220436 T C 39 . DP=7;VDB=2.063840e-02;RPB=-9.980746e-01;AF1=0.5;AC1=1;DP4=0,4,0,3;MQ=50;FQ=42;PV4=1,0.45,1,1;DN=134;DA=T/C;GM=NR_024540.1;GL=WASH7P;FG=intron;FD=intron-variant;CP=0.016;CG=-2.880;CADD=0.699;AA=T;CN=dgv1e1,dgv2n71,dgv3e1,esv27265,nsv428112,nsv7879;DV=by-cluster;DSP=227 GT:PL:GQ 0/1:69,0,90:72
OUTPUT File:
1 877831 rs6672356 T C 44.8 . DP=2;VDB=6.720000e-02;AF1=1;AC1=2;DP4=0,0,1,1;MQ=50;FQ=-33;DN=116;DA=T/C;GM=NM_152486.2,XM_005244723.1,XM_005244724.1,XM_005244725.1,XM_005244726.1,XM_005244727.1;GL=SAMD11;FG=missense,missense,missense,missense,missense,intron;FD=unknown;AAC=TRP/ARG,TRP/ARG,TRP/ARG,TRP/ARG,TRP/ARG,none;PP=343/682,343/715,328/667,327/666,234/573,NA;CDP=1027,1027,982,979,700,NA;GS=101,101,101,101,101,NA;PH=0;CP=0.994;CG=2.510;CADD=0.132;AA=C;CN=dgv10n71,dgv2n67,dgv3e1,dgv8n71,dgv9n71,essv2408,essv4734,nsv10161,nsv428334,nsv509035,nsv517709,nsv832980,nsv871547,nsv871883;DG;DV=by-cluster,by-1000G;DSP=38;CPG=875731-878363;GESP=C:8470/T:0;PAC=NP_689699.2,XP_005244780.1,XP_005244781.1,XP_005244782.1,XP_005244783.1,NA GT:PL:GQ 1/1:76,6,0:10
1 878000 . C T 44.8 . DP=2;VDB=7.520000e-02;AF1=1;AC1=2;DP4=0,0,1,1;MQ=50;FQ=-33;GM=NM_152486.2,XM_005244723.1,XM_005244724.1,XM_005244725.1,XM_005244726.1,XM_005244727.1;GL=SAMD11;FG=synonymous,synonymous,synonymous,synonymous,synonymous,intron;FD=unknown;AAC=LEU,LEU,LEU,LEU,LEU,none;PP=376/682,376/715,361/667,360/666,267/573,NA;CDP=1126,1126,1081,1078,799,NA;CP=0.986;CG=3.890;CADD=2.735;AA=C;CN=dgv10n71,dgv2n67,dgv3e1,dgv8n71,dgv9n71,essv2408,essv4734,nsv10161,nsv428334,nsv509035,nsv517709,nsv832980,nsv871547,nsv871883;DSP=62;CPG=875731-878363;PAC=NP_689699.2,XP_005244780.1,XP_005244781.1,XP_005244782.1,XP_005244783.1,NA GT:PL:GQ 1/1:76,6,0:10
1 881627 rs2272757 G A 205 . DP=9;VDB=1.301207e-01;AF1=1;AC1=2;DP4=0,0,5,4;MQ=50;FQ=-54;DN=100;DA=G/A;GM=NM_015658.3,XM_005244739.1;GL=NOC2L;FG=synonymous;FD=synonymous-codon,unknown;AAC=LEU;PP=615/750,615/755;CDP=1843;CP=0.082;CG=5.170;CADD=0.335;AA=G;CN=dgv10n71,dgv2n67,dgv3e1,dgv8n71,dgv9n71,essv2408,essv4734,nsv10161,nsv428334,nsv509035,nsv517709,nsv832980,nsv871547,nsv871883;DG;DV=by-frequency,by-cluster,by-1000G;DSP=40;GESP=A:6174/G:6830;PAC=NP_056473.2,XP_005244796.1 GT:PL:GQ 1/1:238,27,0:51
First of all, I did not include any code because it looks like homework to me (I have had homework like this). I will however try to explain the steps I took to improve my scripts, even though I know my solutions are far from perfect.
Your script could be slow because, for every line in your CSV file, you open, write to and close your output file. Try to build a list of the lines you want to add to the output file and, once you are done reading and filtering, start writing.
You might also want to consider writing one function per filter and calling these functions with the line as an argument. That way you can easily add filters later on. I use a counter to keep track of how many filters succeeded, and if in the end counter == len(amountOfUsedFilters), I add my line to the list.
Also, why do you use outpt = open('.'.join(vcf_out_split), 'a') in one place and with open(vcf, 'r') as f: in another? Try to be consistent and deliberate in your choices.
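A minimal sketch of those two suggestions (not the poster's actual code; it keeps the simple per-row scan rather than the merge approach given in the next answer, and the DP4 handling is my reading of the question's condition):

import csv

def in_exon(row, coord):
    # same rule as the question: start+1 <= pos <= end, on the same chromosome
    return coord[0] == row[0] and int(coord[1]) + 1 <= int(row[1]) <= int(coord[2])

def dp4_ok(row):
    # True if the DP4 field has at least one forward and one reverse ALT read
    for field in row[7].split(';'):
        if field.startswith('DP4='):
            dp4 = field[4:].split(',')
            return int(dp4[2]) >= 1 and int(dp4[3]) >= 1
    return False

filters = [dp4_ok]          # add more single-row filter functions here

def parse_vcf(vcf_path, coords, out_path):
    kept = []               # buffer the matching lines instead of writing one by one
    with open(vcf_path) as f:
        for row in csv.reader(f, delimiter='\t'):
            if row[0].startswith('#'):
                continue
            if any(in_exon(row, c) for c in coords) and all(flt(row) for flt in filters):
                kept.append('\t'.join(row) + '\n')
    with open(out_path, 'w') as out:    # open and close the output file only once
        out.writelines(kept)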
Bioinformatics for the win!
If both of your files are ordered, you can save a lot of time by iterating over them in parallel, always advancing the one with lowest coordinates. This way you will only handle each line once, not many times.
Here's a basic version of your code that only does the coordinate checking (I don't fully understand your DP4 condition, so I'll leave it to you to add that part back in):
with open(coords_fn) as coords_f, open(vcf_fn) as vcf_f, open(out_fn, 'w') as out_f:
    coords = csv.reader(coords_f, delimiter="\t")
    vcf = csv.reader(vcf_f, delimiter="\t")
    out = csv.writer(out_f, delimiter="\t")
    next(vcf)  # discard the header row, or use out.writerow(next(vcf)) to preserve it!
    try:
        c = next(coords)
        r = next(vcf)
        while True:
            if int(c[1]) >= int(r[1]):    # vcf file is behind
                r = next(vcf)
            elif int(c[2]) < int(r[1]):   # coords file is behind
                c = next(coords)
            else:                         # int(c[1]) < int(r[1]) <= int(c[2])
                out.writerow(r)           # add the DP4 check here, and indent this line under it
                r = next(vcf)             # don't indent this line
    except StopIteration:                 # one of the files has ended
        pass

Python - read 10min from log file

I need some tool to read the last 10 minutes of entries in my log file and, if certain words are logged, print some text.
log file:
23.07.2014 09:22:11 INFO Logging.LogEvent 0 Failed login test@test.com
23.07.2014 09:29:02 INFO Logging.LogEvent 0 login test@test.com
23.07.2014 09:31:55 INFO Logging.LogEvent 0 login test@test.com
23.07.2014 09:44:14 INFO Logging.LogEvent 0 Failed login test@test.com
If during the last 10 minutes some entry contains "Failed", print ALARM.
All I did is find the 'Failed' match, but I have no idea how to check the last 10 minutes in my log file. Any ideas?
from sys import argv
from datetime import datetime, timedelta

with open('log_test.log', 'r') as f:
    for line in f:
        try:
            e = line.index("Failed")
        except ValueError:
            pass
        else:
            print(line)
Your format %d.%m.%Y is worse than a year-first format such as %Y.%m.%d, which can be compared as a plain string.
We also do not know whether the log is big and whether it is sorted. If it is not sorted (which is common for multithreaded applications), you will have to analyze each line and convert its timestamp into a datetime:
import datetime

def get_dt_from_line(s):
    return datetime.datetime.strptime(s[:19], '%d.%m.%Y %H:%M:%S')
Then use it as a filter (for small files):
MAX_CHECK_TIMEDELTA = datetime.timedelta(minutes=10)
LOG_START_ANALYZE_DATETIME = (datetime.datetime.today() - MAX_CHECK_TIMEDELTA)
lines = [s for s in TXT.split('\n') if 'Failed' in s and get_dt_from_line(s) >= LOG_START_ANALYZE_DATETIME]
print('\n'.join(lines))
For big files you can read the file line by line.
If your log file is just for one day you can use string comparison instead of datetime comparison:
LOG_START_ANALYZE_DATETIME = (datetime.datetime.today() - datetime.timedelta(minutes=10)).strftime('%d.%m.%Y %H:%M:%S')
lines = [s for s in TXT.split('\n') if 'Failed' in s and s >= LOG_START_ANALYZE_DATETIME]
If I were you, I would go line by line, get the timestamp of the first line and then iterate until the difference between that first date and the current line's date is more than 10 minutes, counting occurrences of the word "Failed" along the way.
I think you'll sort something out by splitting your lines on spaces. But be careful: if your log format ever changes, your script is likely to stop working.
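For reference, a minimal sketch of the time-window check (hedged: it measures the 10 minutes back from the current time rather than from the first line, and assumes the 'dd.mm.yyyy hh:mm:ss' prefix shown in the question):

from datetime import datetime, timedelta

cutoff = datetime.now() - timedelta(minutes=10)
failed = 0
with open('log_test.log', 'r') as f:
    for line in f:
        # the timestamp is the first 19 characters, e.g. '23.07.2014 09:22:11'
        ts = datetime.strptime(line[:19], '%d.%m.%Y %H:%M:%S')
        if ts >= cutoff and 'Failed' in line:
            failed += 1
if failed:
    print('ALARM')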

how to create an index to parse big text file

I have two files A and B in FASTQ format, which are basically several hundred million lines of text organized in groups of 4 lines starting with an @ as follows:
@120412_SN549_0058_BD0UMKACXX:5:1101:1156:2031#0/1
GCCAATGGCATGGTTTCATGGATGTTAGCAGAAGACATGAGACTTCTGGGACAGGAGCAAAACACTTCATGATGGCAAAAGATCGGAAGAGCACACGTCTGAACTCN
+120412_SN549_0058_BD0UMKACXX:5:1101:1156:2031#0/1
bbbeee_[_ccdccegeeghhiiehghifhfhhhiiihhfhghigbeffeefddd]aegggdffhfhhihbghhdfffgdb^beeabcccabbcb`ccacacbbccB
I need to compare the
5:1101:1156:2031#0/
part between files A and B and write the groups of 4 lines in file B that matched to a new file. I got a piece of code in Python that does that, but it only works for small files, as it parses through all the @-lines of file B for every @-line in file A, and both files contain hundreds of millions of lines.
Someone suggested that I should create an index for file B; I have googled around without success and would be very grateful if someone could point out how to do this or let me know of a tutorial so I can learn. Thanks.
==EDIT==
In theory each group of 4 lines should only exist once in each file. Would it increase the speed enough to break out of the parsing after each match, or do I need a different algorithm altogether?
An index is just a shortened version of the information you are working with. In this case, you will want the "key" - the text between the first colon (':') on the @-line and the final slash ('/') near the end - as well as some kind of value.
Since the "value" in this case is the entire contents of the 4-line block, and since our index is going to store a separate entry for each block, we would be storing the entire file in memory if we used the actual value in the index.
Instead, let's use the file position of the beginning of the 4-line block. That way, you can move to that file position, print 4 lines, and stop. Total cost is the 4 or 8 or however many bytes it takes to store an integer file position, instead of however-many bytes of actual genome data.
Here is some code that does the job, but also does a lot of validation and checking. You might want to throw stuff away that you don't use.
import sys

def build_index(path):
    index = {}
    for key, pos, data in parse_fastq(path):
        if key not in index:
            # Don't overwrite duplicates - use the first occurrence.
            index[key] = pos
    return index

def error(s):
    sys.stderr.write(s + "\n")

def extract_key(s):
    # This much is fairly constant:
    assert(s.startswith('@'))
    (machine_name, rest) = s.split(':', 1)
    # Per wikipedia, this changes in different variants of FASTQ format:
    (key, rest) = rest.split('/', 1)
    return key

def parse_fastq(path):
    """
    Parse the 4-line FASTQ groups in path.
    Validate the contents, somewhat.
    """
    f = open(path)
    i = 0
    # Note: iterating a file is incompatible with fh.tell(). Fake it.
    pos = offset = 0
    for line in f:
        offset += len(line)
        lx = i % 4
        i += 1
        if lx == 0:     # @machine: key
            key = extract_key(line)
            len1 = len2 = 0
            data = [line]
        elif lx == 1:
            data.append(line)
            len1 = len(line)
        elif lx == 2:   # +machine: key or something
            assert(line.startswith('+'))
            data.append(line)
        else:           # lx == 3 : quality data
            data.append(line)
            len2 = len(line)

            if len2 != len1:
                error("Data length mismatch at line "
                      + str(i - 2)
                      + " (len: " + str(len1) + ") and line "
                      + str(i)
                      + " (len: " + str(len2) + ")\n")
            #print "Yielding #%i: %s" % (pos, key)
            yield key, pos, data
            pos = offset

    if i % 4 != 0:
        error("EOF encountered in mid-record at line " + str(i))

def match_records(path, index):
    results = []
    for key, pos, d in parse_fastq(path):
        if key in index:
            # found a match!
            results.append(key)
    return results

def write_matches(inpath, matches, outpath):
    rf = open(inpath)
    wf = open(outpath, 'w')
    for m in matches:
        rf.seek(m)
        wf.write(rf.readline())
        wf.write(rf.readline())
        wf.write(rf.readline())
        wf.write(rf.readline())
    rf.close()
    wf.close()

#import pdb; pdb.set_trace()

index = build_index('afile.fastq')
matches = match_records('bfile.fastq', index)
posns = [index[k] for k in matches]
write_matches('afile.fastq', posns, 'outfile.fastq')
Note that this code goes back to the first file to get the blocks of data. If your data is identical between files, you would be able to copy the block from the second file when a match occurs.
Note also that depending on what you are trying to extract, you may want to change the order of the output blocks, and you may want to make sure that the keys are unique, or perhaps make sure the keys are not unique but are repeated in the order they match. That's up to you - I'm not sure what you're doing with the data.
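As a small illustration of that note (an addition, not part of the original answer): if the 4-line block from the second file is what you want to keep, the data already yielded by parse_fastq can be written directly, skipping the extra seek pass over the first file:

def write_matching_blocks(path, index, outpath):
    # write the matching 4-line blocks straight from the file being scanned
    with open(outpath, 'w') as wf:
        for key, pos, data in parse_fastq(path):
            if key in index:
                wf.writelines(data)

Calling write_matching_blocks('bfile.fastq', index, 'outfile.fastq') would then replace the match_records/write_matches pair.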
These guys claim to parse files of a few gigabytes using a dedicated library; see http://www.biostars.org/p/15113/
from Bio import SeqIO

fastq_parser = SeqIO.parse(fastq_filename, "fastq")
wanted = (rec for rec in fastq_parser if ...)
SeqIO.write(wanted, output_file, "fastq")
A better approach IMO would be to parse it once and load the data into some database instead of that output_file (e.g. MySQL) and later run the queries there.
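A rough illustration of that "parse once, query later" idea, using sqlite3 from the standard library instead of MySQL purely for brevity (the key extraction and file names are assumptions based on the question):

import sqlite3
from itertools import islice

conn = sqlite3.connect('reads.db')
conn.execute('CREATE TABLE IF NOT EXISTS reads (key TEXT PRIMARY KEY, record TEXT)')

with open('bfile.fastq') as f:
    while True:
        block = list(islice(f, 4))          # one 4-line FASTQ record
        if len(block) < 4:
            break
        # key = text between the first ':' and the last '/' of the header line
        key = block[0].split(':', 1)[1].rsplit('/', 1)[0]
        conn.execute('INSERT OR IGNORE INTO reads VALUES (?, ?)',
                     (key, ''.join(block)))
conn.commit()

# later: fetch a record by key without rescanning the FASTQ file
row = conn.execute('SELECT record FROM reads WHERE key = ?',
                   ('5:1101:1156:2031#0',)).fetchone()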

Conversion of Multiple Strings To ASCII

This seems fairly trivial but I can't seem to work it out
I have a text file with the contents:
B>F
I am reading this with the code below, stripping the '>' and trying to convert the strings into their corresponding ASCII values, minus 65, to give me a value that will correspond to another list index.
def readRoute():
    routeFile = open('route.txt', 'r')
    for line in routeFile.readlines():
        route = line.strip('\n' '\r')
        route = line.split('>')
        #startNode, endNode = route
        startNode = ord(route[0])-65
        endNode = ord(route[1])-65
        # Debug (this comment was for my use to explain below the print values)
        print 'Route Entered:'
        print line
        print startNode, ',', endNode, '\n'
        return [startNode, endNode]
However, I am having slight trouble doing the conversion nicely, because the text file only contains one line at the moment, but ideally I need it to support more than one line and run a chunk of code for each line.
For example it could contain:
B>F
A>D
C>F
E>D
So I would want to run the same code outside this function 4 times with the different inputs
Is anyone able to give me a hand?
Edit:
Not sure I made my issue that clear, sorry.
What I need it to do is parse the text file (possibly containing one line or multiple lines like above). I am able to do it for one line with the lines:
startNode = ord(route[0])-65
endNode = ord(route[1])-65
But I get errors when trying to do more than one line, because ord() is receiving inputs it doesn't expect.
If I have (below) in the route.txt
B>F
A>D
This is the error it gives me:
line 43, in readRoute endNode = ord(route[1])-65
TypeError: ord() expected a character, but string of length 2 found
My code above should read the route.txt file and see that B>F is the first route, strip the '>', convert the B and F to ASCII (66 and 70 respectively), then subtract 65 from both to give 1 and 5 (in this example).
The 1 & 5 are corresponding indexes for another "array" (list of lists) to do computations and other things on
Once the other code has completed it can then go to the next line in route.txt which could be A>D and perform the above again
Perhaps this will work for you. I turned the file read into a generator, so you can do as you please with the parsed results in the for loop.
def readRoute(file_name):
    with open(file_name, 'r') as r:
        for line in r:
            yield (ord(line[0])-65, ord(line[2])-65)

filename = 'route.txt'

for startnode, endnode in readRoute(filename):
    print startnode, endnode
If you can't change readRoute, change the contents of the file before each call. Better yet, make readRoute take the filename as a parameter (default it to 'route.txt' to preserve the current behavior) so you can have it process other files.
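A minimal sketch of that parameterised version (keeping the question's ord()-minus-65 conversion; the default argument preserves the current route.txt behaviour):

def readRoute(route_file='route.txt'):
    routes = []
    with open(route_file, 'r') as f:
        for line in f:
            # 'B>F' -> ('B', 'F') -> (1, 5)
            start, end = line.strip().split('>')
            routes.append((ord(start) - 65, ord(end) - 65))
    return routes

for startNode, endNode in readRoute():      # same default file as before
    print startNode, endNode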
What about something like this? It takes the routes defined in your file and turns them into path objects with start and end member variables. As an added bonus PathManager.readFile() allows you to load multiple route files without overwriting the existing paths.
import re

class Path:
    def __init__(self, start, end):
        self.start = ord(start) - 65  # Scale the values as desired
        self.end = ord(end) - 65      # Scale the values as desired

class PathManager:
    def __init__(self):
        self.expr = re.compile("^([A-Za-z])[>]([A-Za-z])$")  # looks for string "C>C"
                                                             # where C is a char
        self.paths = []

    def do_logic_routine(self, start, end):
        # Do custom logic here that will execute before the next line is read
        # Return True for 'continue reading' or False to stop parsing the file
        return True

    def readFile(self, path):
        file = open(path, "r")
        for line in file:
            item = self.expr.match(line.strip())  # strip whitespace before parsing
            if item:
                '''
                item.group(0) is *not* used here; it matches the whole expression
                item.group(1) matches the first parenthesis in the regular expression
                item.group(2) matches the second
                '''
                self.paths.append(Path(item.group(1), item.group(2)))
                if not self.do_logic_routine(self.paths[-1].start, self.paths[-1].end):
                    break

# Running the example
MyManager = PathManager()
MyManager.readFile('route.txt')

for path in MyManager.paths:
    print "Start: %s End: %s" % (path.start, path.end)
Output is:
Start: 1 End: 5
Start: 0 End: 3
Start: 2 End: 5
Start: 4 End: 3
