Optimizing Permutation Search in a File - Python

I wrote a program that, given a set of characters, searches a file for any permutation of that set. I would love your suggestions for optimizing it.
I split the input into multiple files, one per "word" length:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import itertools
import time

start_time = time.time()
chrset = sys.argv[1]
lgr = len(chrset)
f = open('gd' + str(lgr), 'r')

# Build every permutation of the character set as a string.
perms = []
s = list(itertools.permutations(chrset))
for perm in s:
    perms.append(''.join(perm))

# Compare each line of the file against each permutation.
for line in f:
    line = line.rstrip('\n')
    for pp in perms:
        if pp == line:
            print(line)

print("--- %s seconds ---" % (time.time() - start_time))

Related

Why is my progress bar not progressing? Is it in a loop?

I'm trying to make a progress bar that advances as the lines are processed, but I can't get it to work.
import io
from datetime import datetime
from alive_progress import alive_bar

result = io.open("Edificado/edificadoResultadoSinPorBlancos.txt", "w", encoding='utf-8', errors="surrogateescape")
start_time = datetime.now()
print(f"Empece en: {start_time}")
with io.open("Edificado/edificco.txt", "r", encoding='utf-8', errors="surrogateescape") as f:
    data = len(f.readlines())
    with alive_bar(len(data)) as bar:
        for line in f:
            if '|' in line:
                line = line.replace("|", "-")
            result.write(line)
            bar()
result.close()
end_time = datetime.now()
print('Duracion: {}'.format(end_time - start_time))
You can't read the file content twice, once with f.readlines() and again with for line in f, because after the first read the file position is at the end.
Do this instead:
start_time = datetime.now()
print(f"Empece en: {start_time}")
with io.open("Edificado/edificco.txt", "r", encoding='utf-8', errors="surrogateescape") as f, \
     io.open("Edificado/edificadoResultadoSinPorBlancos.txt", "w", encoding='utf-8', errors="surrogateescape") as result:
    lines = f.readlines()
    with alive_bar(len(lines)) as bar:
        for line in lines:
            result.write(line.replace("|", "-"))
            bar()
Also:
- you were calling len twice (on f.readlines() and then on data), which can't work
- you don't need to check for the character's presence before replacing it: str.replace simply does nothing if | isn't there
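If the input file is large, a streaming variant is also possible (a sketch, assuming alive_progress accepts a bar without a total, its "unknown" mode): process the file line by line instead of loading everything with readlines():

import io
from alive_progress import alive_bar

with io.open("Edificado/edificco.txt", "r", encoding="utf-8", errors="surrogateescape") as f, \
     io.open("Edificado/edificadoResultadoSinPorBlancos.txt", "w", encoding="utf-8", errors="surrogateescape") as result:
    # No total given: the bar shows a running count instead of a percentage.
    with alive_bar() as bar:
        for line in f:
            result.write(line.replace("|", "-"))
            bar()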

Is there a method to print the duration of the script and save it inside the csv?

I would like to print the duration of the script and also save it in the CSV. What would be the best method?
import requests

with open('infile.txt', 'r') as f:
    urls = f.readlines()

datalist = []
for url in urls:
    data = requests.get(url)
    datalist.append(data.text)

with open('outfile.txt', 'w') as f:
    for item in datalist:
        f.write("%s\n" % item)
You can use the datetime module.
import requests
from datetime import datetime

def run():
    with open('infile.txt', 'r') as f:
        urls = f.readlines()
    datalist = []
    for url in urls:
        data = requests.get(url)
        datalist.append(data.text)
    with open('outfile.txt', 'w') as f:
        for item in datalist:
            f.write("%s\n" % item)

startTime = datetime.now()
run()
print(datetime.now() - startTime)
One simple way to do this is with the built-in time module. Get the time before the execution, get the time after the execution, subtract them, and you have the time your script took to run. You can then write it to your CSV just like any other text.
import time
# STEP 1: Get start time
start_time = time.time()
# Step 2: Run your code you want to time
result = func_one(some_param)
# Step 3: Calculate total time elapsed
elapsed_time = time.time() - start_time
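To also save the measurement in the CSV, one option (a minimal sketch using the standard csv module; duration_log.csv is a placeholder name and run() is the function defined in the previous answer) is to append a timestamped row with the elapsed seconds:

import csv
import time

start_time = time.time()
run()  # the code being timed
elapsed_seconds = time.time() - start_time

# Append a timestamp and the duration as one CSV row.
with open('duration_log.csv', 'a', newline='') as f:
    csv.writer(f).writerow([time.strftime('%Y-%m-%d %H:%M:%S'), elapsed_seconds])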

Changing the orientation of reverse sequences in a FASTA file doesn't work

I am trying to get the reverse sequences orientated correctly in a file. This is the code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord

def main(in_file):
    out_file = "%s.fa" % os.path.splitext(in_file)[0]
    with open(out_file, "w") as out_handle:
        # Write records from the BAM file one at a time to the output file.
        # Works lazily as BAM sequences are read so will handle large files.
        SeqIO.write(bam_to_rec(in_file), out_handle, "fasta")

def bam_to_rec(in_file):
    """Generator to convert BAM files into Biopython SeqRecords.
    """
    bam_file = pysam.Samfile(in_file, "rb")
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        if read.is_reverse:
            seq = seq.reverse_complement()
        rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
        yield rec

if __name__ == "__main__":
    main(*sys.argv[1:])
When I print out the reverse sequences, the code works. But in the output file the sequence is still written as the reverse sequence. Can anyone help me find out what is going wrong?
Here is the link to my infile:
https://www.dropbox.com/sh/68ui8l7nh5fxatm/AABUr82l01qT1nL8I_XgJaeTa?dl=0
Here's the output on a couple of sequences, feel free to test it. I think your issue is that yield returns a generator which you are not actually iterating, unless I am misunderstanding what you are doing. My code below compares one version that never reverses with one that reverse-complements when needed; the ugly counter is only there to limit the output to 10000 sequences.
Original:
SOLEXA-1GA-2:2:93:1281:961#0
GGGTTAGGTTAGGGTTAGGGTTAGGGTTAGGGTTAG
Becomes:
SOLEXA-1GA-2:2:93:1281:961#0
CTAACCCTAACCCTAACCCTAACCCTAACCTAACCC
And if not reverse:
Original:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Becomes:
SOLEXA-1GA-2:2:12:96:1547#0
ACACACAAACACACACACACACACACACACACCCCC
Here's my code:
import os
import sys
import pysam
from Bio import SeqIO, Seq, SeqRecord

def main(in_file):
    out_file = "%s.fa" % os.path.splitext(in_file)[0]
    with open('test_non_reverse.txt', 'w') as non_reverse:
        with open(out_file, "w") as out_handle:
            # Write records from the BAM file one at a time to the output file.
            # Works lazily as BAM sequences are read so will handle large files.
            i = 0
            for s in bam_to_rec(in_file):
                if i == 10000:
                    break
                i += 1
                SeqIO.write(s, out_handle, "fasta")
            i = 0
            for s in convert_to_seq(in_file):
                if i == 10000:
                    break
                i += 1
                SeqIO.write(s, non_reverse, 'fasta')

def convert_to_seq(in_file):
    bam_file = pysam.Samfile(in_file, "rb")
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
        yield rec

def bam_to_rec(in_file):
    """Generator to convert BAM files into Biopython SeqRecords.
    """
    bam_file = pysam.Samfile(in_file, "rb")
    for read in bam_file:
        seq = Seq.Seq(read.seq)
        if read.is_reverse:
            seq = seq.reverse_complement()
        rec = SeqRecord.SeqRecord(seq, read.qname, "", "")
        yield rec

if __name__ == "__main__":
    main(*sys.argv[1:])
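A quick way to confirm which records were actually reverse-complemented (a sketch, assuming the input was test.bam so the two outputs are test.fa and test_non_reverse.txt) is to compare the two files record by record:

from Bio import SeqIO

with_rc = SeqIO.parse("test.fa", "fasta")                   # output of bam_to_rec
without_rc = SeqIO.parse("test_non_reverse.txt", "fasta")   # output of convert_to_seq

# Records come out of both generators in the same BAM order, so zip pairs them up.
changed = sum(1 for a, b in zip(with_rc, without_rc) if str(a.seq) != str(b.seq))
print("records that were reverse-complemented:", changed)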

Python: How to print values with carriage return on Terminal

When I run the script below in PyCharm, no newline is printed during the download process. However, when I run it in a terminal, newlines are printed.
Here is the code:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import ssl
import urllib.request

def UrlOpen(url, filename):
    data = urllib.request.urlopen(url=url, context=ssl.SSLContext(ssl.PROTOCOL_SSLv23))
    size = int(data.headers.get("content-length"))
    with open(filename, "wb") as f:
        count = 0
        now = time.time()
        for i in data:
            count += len(i)
            f.write(i)
            print("\rSize: {} b, Downloaded: {} b, Completed: {} %, Elapsed: {} s, Speed: {} b/s"
                  .format(size,
                          count,
                          int(100 * count / size),
                          int(time.time() - now),
                          count / (time.time() - now)), flush=True, end="")

if __name__ == "__main__":
    test_url = "http://muhendislik.istanbul.edu.tr/insaat/wp-content/uploads/2015/10/temel_in%C5%9Faat%C4%B1.pdf"
    file_path = "/home/tanberk/Masaüstü/test.pdf"
    UrlOpen(test_url, file_path)
How can I keep the output on a single line when I run this script in a terminal? Thanks in advance.
\r is a carriage return, the old-style Mac line terminator. Did you mean \n for a newline?
print always appends a newline by default. You have to use sys.stdout.write to avoid it, or pass the end= keyword argument to the print() function (available in Python 2 via from __future__ import print_function).
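A minimal sketch of the two approaches mentioned above: print() with end="" and a leading carriage return, or sys.stdout.write(). Either way the cursor returns to the start of the line instead of moving to a new one, so the progress stays on a single line:

import sys
import time

for pct in range(0, 101, 10):
    print("\rCompleted: {} %".format(pct), end="", flush=True)
    # Equivalent without print:
    # sys.stdout.write("\rCompleted: {} %".format(pct)); sys.stdout.flush()
    time.sleep(0.1)
print()  # final newline so the shell prompt starts on a fresh line

One common reason for stray newlines in a real terminal is that the status line is wider than the terminal window, so it wraps; shortening the message avoids that.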

How to speed up process of loading and reading JSON files in Python?

I am running a script (in multiprocessing mode) that extracts some parameters from a bunch of JSON files, but it is currently very slow. Here is the script:
from __future__ import print_function, division
import os
from glob import glob
from os import getpid
from time import time
from sys import stdout
import resource
from multiprocessing import Pool
import subprocess

try:
    import simplejson as json
except ImportError:
    import json

path = '/data/data//*.A.1'
print("Running with PID: %d" % getpid())

def process_file(file):
    start = time()
    filename = file.split('/')[-1]
    print(file)
    with open('/data/data/A.1/%s_DI' % filename, 'w') as w:
        with open(file, 'r') as f:
            for n, line in enumerate(f):
                d = json.loads(line)
                try:
                    domain = d['rrname']
                    ips = d['rdata']
                    for i in ips:
                        print("%s|%s" % (i, domain), file=w)
                except:
                    print(d)
                    pass

if __name__ == "__main__":
    files_list = glob(path)
    cores = 12
    print("Using %d cores" % cores)
    pp = Pool(processes=cores)
    pp.imap_unordered(process_file, files_list)
    pp.close()
    pp.join()
Does anybody know how to speed this up?
Switch from
import json
to
import ujson
https://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/
or switch to orjson
import orjson
https://github.com/ijl/orjson
First, find out where your bottlenecks are.
If it is on the json decoding/encoding step, try switching to ultrajson:
UltraJSON is an ultra fast JSON encoder and decoder written in pure C
with bindings for Python 2.5+ and 3.
The changes would be as simple as changing the import part:
try:
    import ujson as json
except ImportError:
    try:
        import simplejson as json
    except ImportError:
        import json
I've also done a simple benchmark at What is faster - Loading a pickled dictionary object or Loading a JSON file - to a dictionary?, take a look.
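To confirm where the time actually goes before switching libraries, here is a quick sketch using the standard library's cProfile (run on a single file, outside the Pool, inside the question's script where process_file and files_list are defined):

import cProfile
import pstats

pr = cProfile.Profile()
pr.enable()
process_file(files_list[0])   # one file, single process
pr.disable()

# Show the ten most expensive calls by cumulative time.
pstats.Stats(pr).sort_stats("cumulative").print_stats(10)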
I updated the script a bit to try different experiments and found that yes, JSON parsing is CPU bound. I got 28 MB/s, which is better than your 0.04 GB per minute (under 1 MB/s), so I'm not sure what's going on there. When skipping the JSON stuff and just writing to the file, I got 996 MB/s.
In the code below, you can generate a dataset with python slow.py create and test several scenarios by changing the code marked todo:. My dataset was only 800 MB, so I/O was absorbed by the RAM cache (run it twice to make sure the files to read have been cached).
I was surprised that JSON decoding is so CPU intensive.
from __future__ import print_function, division
import os
from glob import glob
from os import getpid
from time import time
from sys import stdout
import resource
from multiprocessing import Pool, cpu_count
import subprocess

# todo: pick your poison
#import json
#import ujson as json
import simplejson as json

import sys

# todo: choose your data path
#path = '/data/data//*.A.1'
#path = '/tmp/mytest'
path = os.path.expanduser('~/tmp/mytest')

# todo: choose your cores
#cores = 12
cores = cpu_count()

print("Running with PID: %d" % getpid())

def process_file(file):
    start = time()
    filename = file.split('/')[-1]
    print(file)
    with open(file + '.out', 'w', buffering=1024*1024) as w:
        with open(file, 'r', buffering=1024*1024) as f:
            for n, line in enumerate(f):
                # todo: for pure bandwidth calculations
                #w.write(line)
                #continue
                try:
                    d = json.loads(line)
                except Exception as e:
                    raise RuntimeError("'%s' in %s: %s" % (str(e), file, line))
                try:
                    domain = d['rrname']
                    ips = d['rdata']
                    for i in ips:
                        print("%s|%s" % (i, domain), file=w)
                except:
                    print(d, 'error')
                    pass
    return os.stat(file).st_size

def create_files(path, files, entries):
    print('creating files')
    extra = [i for i in range(32)]
    if not os.path.exists(path):
        os.makedirs(path)
    for i in range(files):
        fn = os.path.join(path, 'in%d.json' % i)
        print(fn)
        with open(fn, 'w') as fp:
            for j in range(entries):
                json.dump({'rrname': 'fred',
                           'rdata': [str(k) for k in range(10)],
                           'extra': extra}, fp)
                fp.write('\n')

if __name__ == "__main__":
    if 'create' in sys.argv:
        create_files(path, 1000, 100000)
        sys.exit(0)
    files_list = glob(os.path.join(path, '*.json'))
    print('processing', len(files_list), 'files in', path)
    print("Using %d cores" % cores)
    pp = Pool(processes=cores)
    total = 0
    start = time()
    for result in pp.imap_unordered(process_file, files_list):
        total += result
    pp.close()
    pp.join()
    delta = time() - start
    mb = total/1000000
    print('%d MB total, %d MB/s' % (mb, mb/delta))
For installation:
pip install orjson
For import:
import orjson as json
This helps especially when you dump or load large arrays.
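One caveat worth checking when aliasing it this way (an observation about orjson's API, not something from the original post): orjson.dumps() returns bytes rather than str, so code that expects a string needs a decode step:

import orjson as json

d = json.loads('{"rrname": "fred", "rdata": ["1.2.3.4"]}')  # loads accepts str or bytes
blob = json.dumps(d)           # bytes, e.g. b'{"rrname":"fred","rdata":["1.2.3.4"]}'
text = blob.decode("utf-8")    # decode if the surrounding code expects str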
