Run the Same Code for Multiple Files Python - python

I've written some code, shown below (I know it could have been written better). My question is: how can I run this code for multiple files in a folder? Can I use a "for loop" for this? My files are in a folder called task1, and to be honest I couldn't figure out how to do it. Thank you for taking the time to read this.
# Load the input file into ``bands``: one entry per line, with leading and
# trailing whitespace removed. The entries stay strings.
filename = "file1000"
bands = []
with open(filename) as fin:
    bands.extend(line.strip() for line in fin)
def partition(bands, start, end):
    """Partition ``bands[start:end + 1]`` in place around ``bands[start]``.

    Two cursors walk towards each other from both ends of the range and
    swap elements that sit on the wrong side of the pivot. The pivot is
    finally swapped into the position where the cursors crossed.

    Returns the final index of the pivot.
    """
    pivot_value = bands[start]
    left, right = start + 1, end
    while left <= right:
        # Skip everything that already belongs on the right of the pivot.
        while left <= right and bands[right] >= pivot_value:
            right -= 1
        # Skip everything that already belongs on the left of the pivot.
        while left <= right and bands[left] <= pivot_value:
            left += 1
        if left > right:
            break
        bands[left], bands[right] = bands[right], bands[left]
    # ``right`` now marks the last element not greater than the pivot.
    bands[start], bands[right] = bands[right], bands[start]
    return right
def quick_sort(array, start, end):
    """Sort ``array[start:end + 1]`` in place with recursive quicksort.

    Ranges holding fewer than two elements are left untouched.
    """
    if start < end:
        split = partition(array, start, end)
        quick_sort(array, start, split - 1)
        quick_sort(array, split + 1, end)
def heapify(bands, n, i):
    """Sift ``bands[i]`` down until the max-heap property holds below it.

    Only the first ``n`` elements of ``bands`` are treated as part of the
    heap. The list is modified in place and nothing is returned.
    """
    root = i
    while True:
        biggest = root
        left = 2 * root + 1
        right = 2 * root + 2
        if left < n and bands[root] < bands[left]:
            biggest = left
        if right < n and bands[biggest] < bands[right]:
            biggest = right
        if biggest == root:
            # Parent already dominates both children: nothing left to do.
            return
        bands[root], bands[biggest] = bands[biggest], bands[root]
        # Continue sifting from the child that received the old root.
        root = biggest
def heapSort(bands):
    """Sort ``bands`` in place in ascending order using heap sort.

    First the list is turned into a max-heap, then the largest element is
    repeatedly swapped to the end of the shrinking heap region.
    """
    n = len(bands)
    # Only indices below n // 2 have children, so heap construction can
    # start at the last parent instead of visiting every leaf (and the
    # out-of-range index n) as a no-op.
    for i in range(n // 2 - 1, -1, -1):
        heapify(bands, n, i)
    for i in range(n - 1, 0, -1):
        # Move the current maximum behind the heap, then repair the heap.
        bands[i], bands[0] = bands[0], bands[i]
        heapify(bands, i, 0)
def mergeSort(bands):
    """Sort ``bands`` in place in ascending order using a stable merge sort.

    The list is split in half, each half is sorted recursively, and the two
    sorted halves are merged back into ``bands``. Elements that compare
    equal keep their original relative order.
    """
    if len(bands) <= 1:
        return
    mid = len(bands) // 2
    left = bands[:mid]
    right = bands[mid:]
    mergeSort(left)
    mergeSort(right)

    i = j = k = 0
    while i < len(left) and j < len(right):
        # Take from the right half only when it is strictly smaller; on a
        # tie the left element wins, which is what makes the sort stable.
        if right[j] < left[i]:
            bands[k] = right[j]
            j += 1
        else:
            bands[k] = left[i]
            i += 1
        k += 1
    # Exactly one half still has elements; copy them over in one go.
    bands[k:] = left[i:] + right[j:]
def insertionSort(bands):
    """Sort ``bands`` in place in ascending order using insertion sort.

    Each element is pulled out and the larger elements before it are
    shifted one slot to the right until its place is found.
    """
    for position in range(1, len(bands)):
        current = bands[position]
        slot = position
        while slot > 0 and current < bands[slot - 1]:
            bands[slot] = bands[slot - 1]
            slot -= 1
        bands[slot] = current
import time
# Time each sorting algorithm and append the elapsed seconds, separated by
# spaces, to time.txt (quick sort, heap sort, merge sort, insertion sort).
#
# Every algorithm sorts its own copy of ``bands``. Sorting ``bands`` itself
# would hand already-sorted data to every algorithm after the first one and
# make the timings incomparable.
with open("time.txt", "a") as timings:
    for run_sort in (
        lambda data: quick_sort(data, 0, len(data) - 1),
        heapSort,
        mergeSort,
        insertionSort,
    ):
        sample = list(bands)
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring short durations.
        start_time = time.perf_counter()
        run_sort(sample)
        timings.write(str(time.perf_counter() - start_time))
        timings.write(" ")

You can use os.listdir(folder) to get all names in folder (it will be names of files and subfolders) and then you can use for-loop to run your code with every filename. listdir() gives only filenames and you need os.path.join() to create full path to file. You can also use if to filter names.
import os
folder = "/path/to/assignment"
for name in os.listdir(folder):
    filename = os.path.join(folder, name)
    # listdir() returns sub-folder names as well; opening a folder as a
    # file would fail, so only regular files are processed.
    if name.startswith("file") and os.path.isfile(filename):  # drop the startswith() test if you do not need to filter by name
        print(filename)
        # ... your code ...
Alternatively, you can use glob for this. It is useful if you want to filter names. To match all names use *. To filter, you can use e.g. *.txt or file*, etc.
import glob
# To match only some files, use a narrower pattern instead,
# e.g. "/path/to/assignment/file*.txt".
pattern = "/path/to/assignment/*"
matches = glob.glob(pattern)
for filename in matches:
    print(filename)
    # ... your code ...
If you also need the files in subfolders, you can use os.walk(folder):
import os
folder = "/path/to/assignment"
# os.walk() visits ``folder`` and every sub-folder below it.
for root, dirs, files in os.walk(folder):
    for name in files:
        if not name.startswith("file"):  # remove this filter to process every file
            continue
        filename = os.path.join(root, name)
        print(filename)
        # ... your code ...

You can use:
import os

import pandas as pd

d = "**Provide the directory here**"
# Only CSV files are loaded; anything else in the directory is ignored.
files = [name for name in os.listdir(d) if name.endswith(".csv")]
# Name of each DataFrame: the file name without its '.csv' extension.
file = [os.path.splitext(name)[0] for name in files]
# Keep the DataFrames in a dict keyed by name instead of creating variables
# with exec(): file names are not always valid identifiers, and executing
# text built from them is unsafe.
frames = {
    label: pd.read_csv(os.path.join(d, name))
    for label, name in zip(file, files)
}
# The DataFrames themselves, in the same order as ``file``.
a = list(frames.values())
Now you have the list of DataFrames in 'a'. You can iterate over them and pass each one to your function.

Related

Looking for resistance genes in water sample using kmers [Python]

I need some help with my code. I need to look for the presence of resistance genes in a water sample. That translates to having a huge file of reads coming from the water sample and a file of resistance genes. My problem is making the code run in under 5 minutes, which is not happening right now. The issue probably lies in discarding reads as fast as possible, i.e. in having a smart method that only analyzes meaningful reads. Do you have any suggestions? I cannot use any non-standard Python library.
This is my code
import time
def build_lyb(TargetFile):
    """Read a FASTA-style file into a ``{header: sequence}`` dictionary.

    A line starting with ``>`` opens a new record and is used, without its
    line ending, as the dictionary key. All following lines up to the next
    header are concatenated into that record's sequence.

    Args:
        TargetFile: path of the file to read.

    Returns:
        dict mapping each header line to its full sequence string.

    Raises:
        ValueError: if sequence data appears before the first header.
    """
    chunks = {}
    header = None
    # ``with`` closes the file even if parsing fails.
    with open(TargetFile) as handle:
        for line in handle:
            # Strip only the line ending; slicing off the last character
            # would lose a base when the final line has no newline.
            line = line.rstrip("\r\n")
            if line.startswith(">"):
                header = line
                chunks[header] = []
            elif header is not None:
                chunks[header].append(line)
            elif line:
                raise ValueError(
                    "sequence data found before the first '>' header"
                )
    # Join once per record instead of growing a string line by line.
    return {name: "".join(parts) for name, parts in chunks.items()}
def build_kmers(sequence, k_size):
    """Return every overlapping substring of length ``k_size`` in order.

    The result is empty when ``sequence`` is shorter than ``k_size``.
    """
    last_start = len(sequence) - k_size
    return [
        sequence[start:start + k_size]
        for start in range(last_start + 1)
    ]
def calculation(kmers, g, k_size=19):
    """Mark which positions of gene ``g`` are covered by the read's k-mers.

    Args:
        kmers: the k-mers of one read, in read order.
        g: key of the gene in the module-level ``genes`` dictionary.
        k_size: length of each k-mer (defaults to 19, the size the script
            uses when it builds k-mers).

    Returns:
        A list with one entry per gene position: 1 where a matching k-mer
        covers the position, 0 elsewhere.

    Raises:
        ValueError: if ``k_size`` is smaller than 1.
    """
    if k_size < 1:
        raise ValueError("k_size must be at least 1")
    gene = genes[g]
    matches = [0] * len(gene)
    k = 0
    while k < len(kmers):
        # A single find() both tests for the k-mer and locates it.
        pos = gene.find(kmers[k])
        if pos >= 0:
            for i in range(pos, pos + k_size):
                matches[i] = 1
            # Jump to the first k-mer that does not overlap the one found.
            k += k_size
        else:
            k += 1
    return matches
def coverage(matches, g):
    """Return the percentage of positions of gene ``g`` that were covered.

    Args:
        matches: dictionary mapping a gene key to its per-position depth
            list.
        g: key of the gene to evaluate.

    Returns:
        Percentage (0-100) of positions whose depth is at least 1, or 0.0
        when the depth list is empty.
    """
    depths = matches[g]
    if not depths:
        return 0.0
    covered = sum(1 for depth in depths if depth >= 1)
    # Use the list that was passed in rather than a module-level global, so
    # the function works on any depth dictionary.
    return covered / len(depths) * 100
st = time.time()
genes = build_lyb("resistance_genes.fsa")
res_genes = {}  # gene key -> accumulated per-position depth
Flag = False    # True while the current line is to be treated as a read
n_line = 0
# ``with`` guarantees the reads file is closed once it has been processed.
with open('test2.txt', 'r') as infile:
    for line in infile:
        n_line += 1
        # A line starting with '+' ends the read section of a record.
        if line.startswith("+"):
            Flag = False
        if Flag:
            kmers = build_kmers(line[:-1], 19)
            for g in genes:
                # Cheap pre-filter: probe the k-mers at indices 20..41 and
                # only run the full comparison when two of them hit the gene
                # (18 + 2 * 19 = 56).
                counter = 18
                k = 20
                # The length check keeps short reads from raising IndexError.
                while k <= 41 and k < len(kmers):
                    if kmers[k] in genes[g]:
                        counter += 19
                        k += 19
                    else:
                        k += 1
                if counter >= 56:
                    print(n_line)
                    l1 = calculation(kmers, g)
                    if g in res_genes:
                        # Add this read's coverage to the running depth.
                        l2 = res_genes[g]
                        lr = [sum(i) for i in zip(l1, l2)]
                        res_genes[g] = lr
                    else:
                        res_genes[g] = l1
        # A line starting with '#' means the following lines are reads.
        if line.startswith('#'):
            Flag = True
# Report: gene key, its sequence, its per-position depth and its coverage.
for g in res_genes:
    print(g)
    for i in genes[g]:
        print(i, " ", end='')
    print('')
    for i in res_genes[g]:
        print(i, " ", end='')
    print('')
    print(coverage(res_genes, g))
et = time.time()
elapsed_time = et-st
print("Execution time:", elapsed_time, "s")

Should the results of an iterative vs. recursive QuickSort benchmark look like this?

I just created a benchmark to compare the speed of two implementations of quicksort:
iterative and recursive.
I expected that the recursive one would be slower, but I got this plot (blue is the recursive version):
Is it possible that recursion is faster? Or did I make a mistake in my code?
Just in case, I am pasting my code.
import time
import random
import sys
# Benchmark inputs: one list of random integers (1..15000) per size in
# ``numbersList``.
arrayList = []
arr = [random.randint(1,15000) for _ in range(1000)]
numbersList = [100000, 300000, 500000, 900000, 1000000, 1500000]
numbersForBenchmark = []
for size in numbersList:
    arr = [random.randint(1, 15000) for _ in range(size)]
    numbersForBenchmark.append(arr)
print(numbersForBenchmark)
# Elapsed seconds per input size, filled in by the benchmark loop.
recursionTimeArray = []
iterationTimeArray = []
arrRe = arrIt = arr
def partition(lst, start, end):
    """Partition ``lst[start:end + 1]`` in place around ``lst[end]``.

    Elements strictly smaller than the pivot are moved to the front of the
    range, then the pivot is swapped in right behind them.

    Returns the final index of the pivot.
    """
    pivot = lst[end]
    boundary = start  # first slot not yet known to hold a smaller element
    for scan in range(start, end):
        if lst[scan] < pivot:
            lst[scan], lst[boundary] = lst[boundary], lst[scan]
            boundary += 1
    lst[boundary], lst[end] = lst[end], lst[boundary]
    return boundary
def quick_sort_recursive(lst, start, end):
    """Sort ``lst[start:end + 1]`` in place with recursive quicksort."""
    if start >= end:
        # Zero or one element: already sorted, recursion stops here.
        return
    pivot_index = partition(lst, start, end)
    quick_sort_recursive(lst, start, pivot_index - 1)
    quick_sort_recursive(lst, pivot_index + 1, end)
def iter(arr,l,h):
    """Partition ``arr[l:h + 1]`` in place around ``arr[h]``.

    Elements less than or equal to the pivot end up before it.

    Returns the final index of the pivot.
    """
    pivot = arr[h]
    store = l  # next slot for an element that is <= pivot
    for j in range(l, h):
        if arr[j] <= pivot:
            arr[store], arr[j] = arr[j], arr[store]
            store += 1
    arr[store], arr[h] = arr[h], arr[store]
    return store
def quickSortIterative(arr,l,h):
    """Sort ``arr[l:h + 1]`` in place with quicksort, without recursion.

    Sub-ranges that still need sorting are kept on an explicit stack
    instead of the call stack.

    Args:
        arr: list to sort.
        l: index of the first element of the range.
        h: index of the last element of the range.
    """
    # Nothing to sort for empty or single-element ranges. A pre-sized stack
    # of h - l + 1 slots would be too small to hold even the first pair.
    if l >= h:
        return
    pending = [(l, h)]
    while pending:
        low, high = pending.pop()
        p = iter(arr, low, high)
        # Only ranges with at least two elements need further work.
        if p - 1 > low:
            pending.append((low, p - 1))
        if p + 1 < high:
            pending.append((p + 1, high))
# Time both quicksort variants on identical copies of every benchmark list.
for sample in numbersForBenchmark:
    arrRe = sample[:]
    arrIt = sample[:]

    n = len(arrIt)
    # perf_counter() is the clock intended for measuring elapsed time.
    start = time.perf_counter()
    quickSortIterative(arrIt, 0, n-1)
    end = time.perf_counter()
    ITime = end - start
    iterationTimeArray.append(ITime)

    try:
        n = len(arrRe)
        start = time.perf_counter()
        quick_sort_recursive(arrRe,0,n-1)
        end = time.perf_counter()
        rekTime = end - start
    except RecursionError as err:
        print('Recursive quicksort hit the recursion limit for '
              '{} elements: {}'.format(n, err.args[0]))
        # Keep both result lists the same length so they can be plotted.
        rekTime = float('nan')
    recursionTimeArray.append(rekTime)

print("REK time", recursionTimeArray)
print("ITER TIME", iterationTimeArray)

import matplotlib.pyplot as plt
# Plot against the real input sizes so the x-axis matches the benchmark.
plt.plot(numbersList, recursionTimeArray, label="recursive")
plt.plot(numbersList, iterationTimeArray, label="iterative")
plt.legend()
plt.show()
The plots look OK, but I expected a completely different result. Hence my doubts about the results.

Python returns ints when one of variable inside function is active

The main idea is:
searchindex() - repeats a binary search over the list of random data, with looking back and with a fix. (The variable counter1 should save the number of occurrences.)
occur() - just computes the total number of occurrences.
Please help me find the problem.
I always get counter1 = 0 after running the code.
def searchindex(searchlist, secs, x):
    """Binary-search ``searchlist`` for the minute ``secs`` seconds ago.

    The target is the start of the current minute minus ``secs`` seconds,
    formatted as ``'%Y-%m-%d %H:%M'``. ``datetime.fromtimestamp`` formats it
    in the local time zone. ``searchlist`` must be sorted ascending.

    Args:
        searchlist: sorted list of minute strings in the format above.
        secs: how many seconds to go back from the start of this minute.
        x: 1 to find the first matching index, any other value to find the
            last matching index.

    Returns:
        The index of the first or last match, or ``None`` when the target
        is not in the list. A real ``0`` therefore always means "found at
        index 0".
    """
    ts = calendar.timegm(time.gmtime())
    delta_minute_ts = (ts - (ts % 60)) - secs
    last_minute = datetime.datetime.fromtimestamp(delta_minute_ts).strftime('%Y-%m-%d %H:%M')
    start = 0
    end = len(searchlist) - 1
    counter1 = None
    while start <= end:
        mid = (start + end) // 2
        if searchlist[mid] == last_minute:
            counter1 = mid
            # Keep searching on the side where a further match could be.
            if x == 1:
                end = mid - 1
            else:
                start = mid + 1
        elif last_minute < searchlist[mid]:
            end = mid - 1
        else:
            start = mid + 1
    return counter1
def occur():
    """Count the entries of ``new1`` that fall in the previous minute.

    Looks up the first and the last matching index with ``searchindex`` and
    returns how many entries lie between them, inclusive.

    Returns:
        The number of occurrences, or the string ``'no results'`` when
        ``searchindex`` reports a miss as ``None``.
    """
    start_result = searchindex(new1, 60, 1)
    end_result = searchindex(new1, 60, 2)
    if start_result is None or end_result is None:
        return 'no results'
    # Without ``return`` the count was computed and thrown away.
    return end_result - start_result + 1

list assignment index out of range by code python?

I keep getting an
IndexError: list assignment index out of range.
The error is on line 78.
This code is written to find DNA motifs for bioinformatics.
How can we solve this error?
Here is my code:
from math import log
class MotifMedianFinding(object):
    """Search DNA sequences from a FASTA-like file for a median string/motif.

    Written for Python 2 (print statements, ``xrange``, ``dict.iteritems``,
    ``file.next``). Several helpers are incomplete; see the NOTE(review)
    comments on the individual methods.
    """

    def __init__(self, input_file):
        """Open the sequence file and keep the handle in ``input_lines``."""
        # NOTE(review): ``input_file`` is never used; the path below is
        # hard-coded and the file handle is never closed.
        super(MotifMedianFinding, self).__init__()
        self.input_lines = open("C:\\Users\\A.Khassawneh\\Desktop\\fasta.txt")

    def output(self):
        """Read the sequences and run both searches on each of them."""
        sequences = {}
        for line in self.input_lines:
            if '>' in line:
                # The line after a header line is taken as its sequence.
                sequences[line] = self.input_lines.next()
        for label, seq in sequences.iteritems():
            print "DNA:" + seq + "\n\n\n\n\n"
            median = self.median_string(seq, 5,5, len(seq))
            # NOTE(review): ``median`` (a string) is passed as ``t``, which
            # motif() compares against the integer level ``i`` - confirm.
            self.motif(seq, median,5,len(seq))

    def median_string(self, dna, t, n, l):
        """Branch-and-bound search for the median string of ``dna``."""
        # NOTE(review): the vertex always has 5 entries while callers pass
        # ``l = len(seq)``; next_vertex() indexes it with values up to ``l``.
        start_pos = start_pos = [1,1,1,1,1]
        best_dist = 1000000000
        i = 1
        while i > 0:
            if i < l:
                prefix = str(start_pos)
                # NOTE(review): hamming_score() is a stub that returns None.
                opt_dist = self.hamming_score(prefix, dna)
                if opt_dist > best_dist:
                    s,i = self.bypass(start_pos,i,l,4)
                else:
                    s,i = self.next_vertex(start_pos,i,l,4)
            else:
                # NOTE(review): ``s`` is only bound by the branch above.
                word = str(s)
                if self.hamming_score(word, dna) < best_dist:
                    best_dist = self.hamming_score(word, dna)
                    bestword = word
                s,i = self.next_vertex(start_pos,i,l,4)
        # NOTE(review): ``bestword`` is unbound if no leaf improved best_dist.
        print "Best Word: %s (tot_dis = %s)" % (bestword,best_dist)
        return bestword

    def motif(self, dna, t, n, l):
        """Branch-and-bound search for the best-scoring motif in ``dna``."""
        start_pos = [1,1,1,1,1]
        best_score = 0
        i = 1
        # NOTE(review): ``1 > 0`` is always true and the loop body has no
        # break, so the statements after the loop are never reached; the
        # sibling method loops on ``i > 0`` - probably intended here too.
        while 1 > 0:
            if i < t:
                # NOTE(review): neither ``Score`` nor ``s`` is defined in
                # this method or class.
                opt_score = Score(s, i, dna) + (t-1) * l
                if opt_score < best_score:
                    start_pos, i = self.bypass(start_pos, i, t, n-l+1)
                else:
                    start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
            else:
                # NOTE(review): score() is declared with a single parameter
                # and no ``self``, yet it is called with 2 and 1 arguments.
                if self.score(start_pos, dna) > best_score:
                    best_score = self.score(start_pos)
                    best_motif = str(s)
                start_pos, i = self.next_vertex(start_pos, i, t, n-l+1)
        print "motif consensus string: %s (consensus_score = %s) " % (best_motif, best_score)
        # NOTE(review): ``start_pos`` holds ints; str.join needs strings.
        print "motif positions/string s=(s1..st): %s" % ', '.join(start_pos)
        return best_motif

    def bypass(vertex, level, l, k):
        """Skip the subtree below ``level`` and move to the next vertex."""
        # NOTE(review): declared without ``self`` although it is called as
        # ``self.bypass(...)``; ``a`` is not defined here (``vertex`` was
        # presumably meant); the loop tests index ``j`` and never ``ind``.
        j = level
        for ind in xrange(j,1,-1):
            if a[j] < k:
                a[j] = a[j] + 1
                return vertex, j
        return vertex, 0

    def next_vertex(self, vertex, level, L, k):
        """Advance to the next vertex of the search tree.

        Returns ``(vertex, new_level)``; a level of 0 means the traversal
        is finished.
        """
        if level <L:
            # Descend one level. This assignment raises IndexError once
            # ``level + 1`` reaches ``len(vertex)``.
            vertex[level+1] = 1
            return vertex,level+1
        else:
            j = L
            # xrange(j, 1, -1) yields j, j-1, ..., 2.
            # NOTE(review): the test reads ``vertex[ind]`` but the update
            # writes ``vertex[j]`` - confirm which index is intended.
            for ind in xrange(j,1,-1):
                if vertex[ind] < k:
                    vertex[j] = vertex[j] + 1
                    return vertex, j
        return vertex, 0

    def score(start_pos):
        """Return the sum of the values in ``start_pos``."""
        # NOTE(review): declared without ``self`` but called as a method.
        total = 0
        for i in start_pos:
            total += i
        return total

    def hamming_score(self, s, dna):
        """Placeholder: not implemented, always returns None."""
        pass

# The constructor argument is ignored; __init__ opens a hard-coded path.
motif_median = MotifMedianFinding('HMP-part.fa')
motif_median.output()
xrange(x,y) goes from x to y-1 (x, x+1.... y-1). In your code, it would have been fine to do xrange(1,j), because that wouldn't have included j. But if you swap it to xrange(j,1,-1), you go (j, j-1.... 2).
Basically, you probably need to change it to xrange(j-1,0,-1) depending on your intended range.

Index out of Range while counting inversions

I am trying to count the number of inversions in a .txt file given as a command-line argument. Whenever it gets to the line that actually checks whether there is an inversion, I get an index out of range error. I have tried writing down the position and value of i and j for each loop iteration, but I can't figure out how to stop it from going out of range. Here is the error:
File "./counting_inversions.py", line 31, in sortAndCountSplit
if (l[i] <= r[j]):
IndexError: list index out of range
Does any one else know a solution?
import argparse
def readFile():
    """Read the input file named on the command line.

    The first line of the file is parsed as an integer and every remaining
    line as one integer value.

    Returns:
        A two-element list ``[n, vals]``: the first-line integer and the
        list of the remaining integers.
    """
    parser = argparse.ArgumentParser(description='Print the given input file.')
    parser.add_argument('filename', help='path to a file')
    options = parser.parse_args()
    with open(options.filename, 'r') as in_file:
        n = int(in_file.readline())
        vals = [int(row) for row in in_file]
    return [n, vals]
def sortAndCount(invList):
    """Sort ``invList`` and count its inversions with merge sort.

    Args:
        invList: list of mutually comparable values.

    Returns:
        A tuple ``(sorted_list, inversion_count)``.
    """
    # Lists with fewer than two elements are sorted and contain no
    # inversions. Testing only for length 1 would recurse forever on [].
    if len(invList) <= 1:
        return (invList, 0)
    midpoint = len(invList) // 2
    left, lc = sortAndCount(invList[:midpoint])
    right, rc = sortAndCount(invList[midpoint:])
    arr, sc = sortAndCountSplit(left, right)
    return (arr, (lc + rc + sc))
def sortAndCountSplit(l, r):
    """Merge two sorted lists and count the inversions between them.

    A split inversion is a pair ``(a, b)`` with ``a`` in ``l``, ``b`` in
    ``r`` and ``a > b``.

    Args:
        l: sorted left list.
        r: sorted right list.

    Returns:
        A tuple ``(merged_sorted_list, split_inversion_count)``.
    """
    s = []
    i = j = inversions = 0
    # Compare only while both lists still have elements, so neither index
    # can ever run past the end of its list.
    while i < len(l) and j < len(r):
        if l[i] <= r[j]:
            s.append(l[i])
            i += 1
        else:
            s.append(r[j])
            j += 1
            # r[j] is smaller than every element still left in ``l``.
            inversions += len(l) - i
    # At most one list has leftovers; they are already in order and add no
    # further inversions.
    s.extend(l[i:])
    s.extend(r[j:])
    return (s, inversions)
def main():
    """Read the input file and print its sorted values and inversion count."""
    _declared_count, values = readFile()
    print(sortAndCount(values))
main()

Categories