Getting Out of Memory Error for Join Algorithm - python

I have a dataset, sitting in a .txt file, consisting of 10 million rows in the form of RDF triples, like so:
wsdbm:User0 wsdbm:follows wsdbm:User300 .
wsdbm:User6 wsdbm:likes wsdbm:Product92 .
wsdbm:Product0 rev:hasReview wsdbm:Review478 .
wsdbm:User2 wsdbm:friendOf wsdbm:User119 .
....
Since these are RDF triples, in our case we have
Subjects: User0, User6, Product0, User2
Predicates: follows, likes, hasReview, friendOf
Objects: User300, Product92, Review478, User119
My goal is to write a query in the SQL form:
SELECT follows.subject, follows.object, friendOf.object,
likes.object, hasReview.object
FROM follows, friendOf, likes, hasReview
WHERE follows.object = friendOf.subject
AND friendOf.object = likes.subject
AND likes.object = hasReview.subject
So far, I have created a class called PropertyTables, which has a method that iterates over the initial file and converts each subject, predicate and object into an integer, to improve computational time on the join and save memory:
class PropertyTables():
    """
    This class holds all 4 Property Tables necessary for the required query.
    Each Property Table is an instance of the class 'PropertyTable'.
    """
    def __init__(self):
        self.property_tables = defaultdict()
        self.hash_map = HashDict()

    def parse_file(self, file_path, remove_prefix=False):
        data = open(file_path, 'r')
        for line in data:
            subj, prop, *obj = line.rstrip('\n.').split('\t')
            obj = obj[0].rstrip()
            if remove_prefix:
                subj, prop, obj = [self.remove_prefix(s) for s in (subj, prop, obj)]
            if prop in ['follows', 'friendOf', 'likes', 'hasReview']:
                self.hash_and_store(subj, prop, obj)
        data.close()
The class PropertyTable, mentioned in the docstring:
class PropertyTable():
    """
    This class represents a single Property Table, i.e. it holds every Subject and Object.
    """
    def __init__(self):
        self.table = []

    def insert(self, r, s):
        # If r and s are already tuples, they get appended to the Property Table.
        # Otherwise, we convert them to a tuple beforehand. This is mostly relevant
        # when creating the Property Tables while reading the data.
        if type(r) == tuple:
            self.table.append(r + s)
        else:
            self.table.append((r, s))
The class HashDict() is a simple dictionary that hashes values, so we can retrieve them again after the join.
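For context, here is a minimal sketch of what HashDict amounts to (a simplified illustration rather than the real class; only the hash_values method, which also appears in the profiler output below, is taken from the post, the rest is invented):

from itertools import count

class HashDict(dict):
    # Sketch: interns each string as a consecutive integer and keeps the
    # reverse mapping so original values can be recovered after the join.
    def __init__(self):
        super().__init__()
        self._ids = count()
        self.reverse = {}  # int -> original string

    def hash_values(self, *values):
        # Return one integer ID per input value.
        return tuple(self._intern(v) for v in values)

    def _intern(self, value):
        if value not in self:
            i = next(self._ids)
            self[value] = i
            self.reverse[i] = value
        return self[value]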
To keep this post from getting too long, I currently have a single hash join algorithm:
def hash_join(self, property_1: PropertyTable, index_0, property_2: PropertyTable, index_1):
    ht = defaultdict(list)
    # Build phase: hash property_1 on its join column
    for s in property_1.table:
        ht[s[index_0]].append(s)
    # Probe phase: join matching rows from property_2
    joined_table = PropertyTable()
    for r in property_2.table:
        for s in ht[r[index_1]]:
            joined_table.insert(s, r)
    return joined_table
I use this function to sequentially join each table, given the requirements from before.
WHERE follows.object = friendOf.subject
AND friendOf.object = likes.subject
AND likes.object = hasReview.subject
join_follows_friendOf = hash_join(pt.property_tables['follows'], 1, pt.property_tables['friendOf'], 0)
join_friendOf_likes = hash_join(join_follows_friendOf, 3, pt.property_tables['likes'], 0)
join_likes_hasReview = hash_join(join_friendOf_likes, 5, pt.property_tables['hasReview'], 0)
The result is correct for small tables, but 10 million rows simply result in an Out of Memory Error, and I am looking for ways to avoid this. I am sorry for the very extensive post, but I think some details are necessary to get useful advice!
Edit:
Line # Mem usage Increment Occurrences Line Contents
=============================================================
53 68.0 MiB 68.0 MiB 1 @profile
54 def hash_and_store(self, subj, prop, obj):
55
56 68.0 MiB 0.0 MiB 1 hashed_subj, hashed_obj = self.hash_map.hash_values(subj, obj)
57
58 68.0 MiB 0.0 MiB 1 if prop not in self.property_tables:
59 self.property_tables[prop] = PropertyTable()
60 68.0 MiB 0.0 MiB 1 self.property_tables[prop].insert(hashed_subj, hashed_obj)
Line # Mem usage Increment Occurrences Line Contents
=============================================================
32 68.1 MiB 68.1 MiB 1 @profile
33 def parse_file(self, file_path, remove_prefix = False):
34
35 68.1 MiB 0.0 MiB 1 data = open(file_path, 'r')
36
37
38
39
40
41 80.7 MiB 0.3 MiB 109311 for line in data:
42 80.7 MiB 0.0 MiB 109310 subj, prop, *obj = line.rstrip('\n.').split('\t')
43 80.7 MiB 0.5 MiB 109310 obj = obj[0].rstrip()
44
45 80.7 MiB 0.0 MiB 109310 if remove_prefix:
46 80.7 MiB 9.0 MiB 655860 subj, prop, obj = [self.remove_prefix(s) for s in (subj, prop, obj)]
47
48 80.7 MiB 0.0 MiB 109310 if prop in ['follows', 'friendOf', 'likes', 'hasReview']:
49 80.7 MiB 2.8 MiB 80084 self.hash_and_store(subj, prop, obj)
50
51 80.7 MiB 0.0 MiB 1 data.close()
Line # Mem usage Increment Occurrences Line Contents
=============================================================
38 80.7 MiB 80.7 MiB 1 @profile
39 def hash_join(self, property_1: PropertyTable, index_0, property_2: PropertyTable, index_1):
40
41 80.7 MiB 0.0 MiB 1 ht = defaultdict(list)
42
43 # Create Hash Table for table1
44
45 81.2 MiB 0.0 MiB 31888 for s in property_1.table:
46 81.2 MiB 0.5 MiB 31887 ht[s[index_0]].append(s)
47
48 # Join Tables
49
50 81.2 MiB 0.0 MiB 1 joined_table = PropertyTable()
51
52 203.8 MiB 0.0 MiB 45713 for r in property_2.table:
53 203.8 MiB 0.0 MiB 1453580 for s in ht[r[index_1]]:
54 203.8 MiB 122.6 MiB 1407868 joined_table.insert(s, r)
55
56 203.8 MiB 0.0 MiB 1 return joined_table

The core of your question is this:
The result is correct for small tables, but 10 million rows simply result in an Out of Memory Error and I am looking for ways to avoid this.
Following your top-level problem statement but with a less generic structure, we can do something like this:
def runQuery(dataLines):
    from collections import defaultdict
    pred = dict(zip(['follows', 'friendOf', 'likes', 'hasReview'], range(4)))
    tables = [defaultdict(list) for _ in pred]

    def encode(s):
        # Encode each token as an integer: trailing digits for users/products/reviews,
        # a small table index for the four predicates, None for anything else.
        if s[-1].isdigit():
            i = 0
            while s[-1 - i].isdigit():
                i += 1
            return int(s[-i:])
        if any(s.endswith(k) for k in pred):
            return sum(v for k, v in pred.items() if s.endswith(k))
        return None

    for line in dataLines:
        if not line:
            continue
        # split on any whitespace, so both tab- and space-separated triples parse
        subj, prop, *obj = line.rstrip('\n.').split()
        obj = obj[0].rstrip()
        subj, prop, obj = [encode(s) for s in (subj, prop, obj)]
        if prop is not None:
            tables[prop][subj].append(obj)
    tables = [{k: tuple(v) for k, v in table.items()} for table in tables]
    #[print(list(pred.keys())[i], tables[i], sep='\n') for i in range(len(pred))]

    # create reverse index for subject, object where subject [user] follows object [user]
    object_of_follows = defaultdict(set)
    for k, v in tables[pred['follows']].items():
        for user in v:
            object_of_follows[user].add(k)

    # create reverse index for subject, object where subject [user] is friendOf object [user]
    object_of_friendOf = defaultdict(set)
    for k, v in tables[pred['friendOf']].items():
        if k in object_of_follows:
            for user in v:
                object_of_friendOf[user].add(k)

    # create reverse index for subject, object where subject [user] likes object [product]
    object_of_likes = defaultdict(set)
    for k, v in tables[pred['likes']].items():
        if k in object_of_friendOf:
            for product in v:
                object_of_likes[product].add(k)

    # create reverse index for subject, object where subject [product] hasReview object [review]
    object_of_hasReview = defaultdict(set)
    for k, v in tables[pred['hasReview']].items():
        if k in object_of_likes:
            for review in v:
                object_of_hasReview[review].add(k)

    def addToResult(result, e):
        d = object_of_hasReview[e]
        c = {y for x in d for y in object_of_likes[x]}
        b = {y for x in c for y in object_of_friendOf[x]}
        a = {y for x in b for y in object_of_follows[x]}
        toAdd = [(ax, bx, cx, dx, e) for dx in d for cx in c for bx in b for ax in a]
        result += toAdd

    result = []
    for e in object_of_hasReview:
        addToResult(result, e)
    print(f'result row count {len(result):,}')
    return result
Explanation:
Create a list of 4 tables (follows, friendOf, likes, hasReview), each a dictionary mapping subject to a tuple of objects
Create 4 reverse indexes (object_of_follows, object_of_friendOf, object_of_likes, object_of_hasReview); for example:
object_of_follows is a dict that maps each user that is an object in follows to a set of users, each of which is a subject in follows that follows the object
object_of_friendOf is a dict that maps each object (user) in friendOf to a set of users, each of which is a subject (user) associated with the object in friendOf and is in object_of_follows (in other words, is an object for one or more subjects in follows)
etc.
Explode each review that survived in object_of_hasReview into multiple result rows, one for each unique combination of follows.subject, follows.object, friendsOf.object, likes.object and hasReview.object, as specified in the query
Return the list of all such exploded rows.
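To make the reverse-index idea concrete, here is a toy illustration with invented values (not taken from the dataset above):

# Toy illustration: forward table (subject -> objects) vs. reverse index (object -> subjects).
follows = {0: (10, 11), 1: (11,)}
object_of_follows = {}
for subj, objs in follows.items():
    for obj in objs:
        object_of_follows.setdefault(obj, set()).add(subj)
print(object_of_follows)  # {10: {0}, 11: {0, 1}}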
Test code for 10 million lines:
dataLines = []
numFollowers = 1000
numChildren = 10
overlapFactor = max(1, numChildren // 2)

def largerPowerOfTen(x):
    y = 1
    while x >= y:
        y *= 10
    return y

aCeil = largerPowerOfTen(numFollowers)
bCeil = largerPowerOfTen(aCeil * numChildren)
cCeil = largerPowerOfTen(bCeil * numChildren)
dCeil = largerPowerOfTen(cCeil * numChildren)
friendOf, likes = set(), set()
for a in range(numFollowers):
    for b in range(aCeil + a * overlapFactor, aCeil + a * overlapFactor + numChildren):
        dataLines.append(f'wsdbm:User{a} wsdbm:follows wsdbm:User{b} .\n')
        for c in range(bCeil + b * overlapFactor, bCeil + b * overlapFactor + numChildren):
            if (b, c) not in friendOf:
                dataLines.append(f'wsdbm:User{b} wsdbm:friendOf wsdbm:User{c} .\n')
                friendOf.add((b, c))
            for d in range(cCeil + c * overlapFactor, cCeil + c * overlapFactor + numChildren):
                if (c, d) not in likes:
                    dataLines.append(f'wsdbm:User{c} wsdbm:likes wsdbm:Product{d} .\n')
                    likes.add((c, d))
                for e in range(dCeil * (d + 1), dCeil * (d + 1) + numChildren):
                    dataLines.append(f'wsdbm:Product{d} wsdbm:hasReview wsdbm:Review{e} .\n')

print(f'dataLines row count {len(dataLines):,}')

from timeit import timeit
n = 1
print('Timeit results:')
t = timeit("runQuery(dataLines)", setup="from __main__ import dataLines, runQuery", number=n) / n
print(f'======== runQuery ran in {t} seconds using {n} iterations')

'''
result = runQuery(dataLines)
print(f'result row count {len(result):,}')
print(f'{"follows.subject":>20}{"follows.object":>20}{"friendsOf.object":>20}{"likes.object":>20}{"hasReview.object":>20}')
[print(f'{a:20}{b:20}{c:20}{d:20}{e:20}') for a, b, c, d, e in result]
'''
Output:
dataLines row count 10,310,350
Timeit results:
result row count 12,398,500
======== runQuery ran in 81.53253880003467 seconds using 1 iterations
Here's input/output from a smaller-scale sample run:
Params
numFollowers = 3
numChildren = 3
overlapFactor = 2
Input (after storing in tables):
follows
{0: (10, 11, 12), 1: (12, 13, 14), 2: (14, 15, 16)}
friendOf
{10: (120, 121, 122), 11: (122, 123, 124), 12: (124, 125, 126), 13: (126, 127, 128), 14: (128, 129, 130), 15: (130, 131, 132), 16: (132, 133, 134)}
likes
{120: (1240, 1241, 1242), 121: (1242, 1243, 1244), 122: (1244, 1245, 1246), 123: (1246, 1247, 1248), 124: (1248, 1249, 1250), 125: (1250, 1251, 1252), 126: (1252, 1253, 1254), 127: (1254, 1255, 1256), 128: (1256, 1257, 1258), 129: (1258, 1259, 1260), 130: (1260, 1261, 1262), 131: (1262, 1263, 1264), 132: (1264, 1265, 1266), 133: (1266, 1267, 1268), 134: (1268, 1269, 1270)}
hasReview
{1240: (12410000, 12410001, 12410002), 1241: (12420000, 12420001, 12420002), 1242: (12430000, 12430001, 12430002, 12430000, 12430001, 12430002), 1243: (12440000, 12440001, 12440002), 1244: (12450000, 12450001, 12450002, 12450000, 12450001, 12450002, 12450000, 12450001, 12450002), 1245: (12460000, 12460001, 12460002, 12460000, 12460001, 12460002), 1246: (12470000, 12470001, 12470002, 12470000, 12470001, 12470002, 12470000, 12470001, 12470002), 1247: (12480000, 12480001, 12480002), 1248: (12490000, 12490001, 12490002, 12490000, 12490001, 12490002, 12490000, 12490001, 12490002, 12490000, 12490001, 12490002), 1249: (12500000, 12500001, 12500002, 12500000, 12500001, 12500002, 12500000, 12500001, 12500002), 1250: (12510000, 12510001, 12510002, 12510000, 12510001, 12510002, 12510000, 12510001, 12510002, 12510000, 12510001, 12510002, 12510000, 12510001, 12510002), 1251: (12520000, 12520001, 12520002, 12520000, 12520001, 12520002), 1252: (12530000, 12530001, 12530002, 12530000, 12530001, 12530002, 12530000, 12530001, 12530002, 12530000, 12530001, 12530002, 12530000, 12530001, 12530002), 1253: (12540000, 12540001, 12540002, 12540000, 12540001, 12540002, 12540000, 12540001, 12540002), 1254: (12550000, 12550001, 12550002, 12550000, 12550001, 12550002, 12550000, 12550001, 12550002, 12550000, 12550001, 12550002), 1255: (12560000, 12560001, 12560002), 1256: (12570000, 12570001, 12570002, 12570000, 12570001, 12570002, 12570000, 12570001, 12570002, 12570000, 12570001, 12570002), 1257: (12580000, 12580001, 12580002, 12580000, 12580001, 12580002, 12580000, 12580001, 12580002), 1258: (12590000, 12590001, 12590002, 12590000, 12590001, 12590002, 12590000, 12590001, 12590002, 12590000, 12590001, 12590002, 12590000, 12590001, 12590002), 1259: (12600000, 12600001, 12600002, 12600000, 12600001, 12600002), 1260: (12610000, 12610001, 12610002, 12610000, 12610001, 12610002, 12610000, 12610001, 12610002, 12610000, 12610001, 12610002, 12610000, 12610001, 12610002), 1261: (12620000, 12620001, 12620002, 12620000, 12620001, 12620002, 12620000, 12620001, 12620002), 1262: (12630000, 12630001, 12630002, 12630000, 12630001, 12630002, 12630000, 12630001, 12630002, 12630000, 12630001, 12630002), 1263: (12640000, 12640001, 12640002), 1264: (12650000, 12650001, 12650002, 12650000, 12650001, 12650002, 12650000, 12650001, 12650002), 1265: (12660000, 12660001, 12660002, 12660000, 12660001, 12660002), 1266: (12670000, 12670001, 12670002, 12670000, 12670001, 12670002, 12670000, 12670001, 12670002), 1267: (12680000, 12680001, 12680002), 1268: (12690000, 12690001, 12690002, 12690000, 12690001, 12690002), 1269: (12700000, 12700001, 12700002), 1270: (12710000, 12710001, 12710002)}
Output
result row count 351
follows.subject follows.object friendsOf.object likes.object hasReview.object
0 10 120 1240 12410000
0 10 120 1240 12410001
0 10 120 1240 12410002
0 10 120 1241 12420000
0 10 120 1241 12420001
0 10 120 1241 12420002
0 10 120 1242 12430000
0 10 121 1242 12430000
0 10 120 1242 12430001
0 10 121 1242 12430001
0 10 120 1242 12430002
0 10 121 1242 12430002
0 10 121 1243 12440000
0 10 121 1243 12440001
0 10 121 1243 12440002
0 10 121 1244 12450000
0 11 121 1244 12450000
0 10 122 1244 12450000
0 11 122 1244 12450000
0 10 121 1244 12450001
0 11 121 1244 12450001
0 10 122 1244 12450001
0 11 122 1244 12450001
0 10 121 1244 12450002
0 11 121 1244 12450002
etc.

Related

Trying to construct a greedy algorithm with python

So I'm trying to create a greedy algorithm for a knapsack problem. The txt file below is the knap20.txt file. The first line gives the number of items, in this case 20. The last line gives the capacity of the knapsack, in this case 524. The remaining lines give the index, value and weight of each item.
My function should ideally return the solution in a list along with the total value.
From what I can tell by my results, my program is working correctly. Is it working as you would expect, and how can I improve it?
txt file
20
1 91 29
2 60 65
3 61 71
4 9 60
5 79 45
6 46 71
7 19 22
8 57 97
9 8 6
10 84 91
11 20 57
12 72 60
13 32 49
14 31 89
15 28 2
16 81 30
17 55 90
18 43 25
19 100 82
20 27 19
524
python file
import os
import matplotlib.pyplot as plt

def get_optimal_value(capacity, weights, values):
    value = 0.
    numItems = len(values)
    valuePerWeight = sorted([[values[i] / weights[i], weights[i]] for i in range(numItems)], reverse=True)
    while capacity > 0 and numItems > 0:
        maxi = 0
        idx = None
        for i in range(numItems):
            if valuePerWeight[i][1] > 0 and maxi < valuePerWeight[i][0]:
                maxi = valuePerWeight[i][0]
                idx = i
        if idx is None:
            return 0.
        if valuePerWeight[idx][1] <= capacity:
            value += valuePerWeight[idx][0] * valuePerWeight[idx][1]
            capacity -= valuePerWeight[idx][1]
        else:
            if valuePerWeight[idx][1] > 0:
                value += (capacity / valuePerWeight[idx][1]) * valuePerWeight[idx][1] * valuePerWeight[idx][0]
                return values, value
        valuePerWeight.pop(idx)
        numItems -= 1
    return value
def read_kfile(fname):
    print('file started')
    with open(fname) as kfile:
        print('fname found', fname)
        lines = kfile.readlines()  # reads the whole file
    n = int(lines[0])
    c = int(lines[n+1])
    vs = []
    ws = []
    lines = lines[1:n+1]  # Removes the first and last line
    for l in lines:
        numbers = l.split()  # Converts the string into a list
        vs.append(int(numbers[1]))  # Appends value, need to convert to int
        ws.append(int(numbers[2]))  # Appends weight, need to convert to int
    return n, c, vs, ws

dir_path = os.path.dirname(os.path.realpath(__file__))  # Get the directory where the file is located
os.chdir(dir_path)  # Change the working directory so we can read the file
knapfile = 'knap20.txt'
nitems, capacity, values, weights = read_kfile(knapfile)
val1, val2 = get_optimal_value(capacity, weights, values)
print('values', val1)
print('value', val2)
result
values [91, 60, 61, 9, 79, 46, 19, 57, 8, 84, 20, 72, 32, 31, 28, 81, 55, 43, 100, 27]
value 733.2394366197183
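For comparison, the same fractional-knapsack greedy can be written much more compactly: once the items are sorted by value-per-weight ratio, no inner search loop is needed. This is a sketch of the idea (my own simplification, not a drop-in replacement for the code above):

def greedy_knapsack(capacity, weights, values):
    # Best value-per-weight ratio first.
    items = sorted(zip(values, weights), key=lambda vw: vw[0] / vw[1], reverse=True)
    total = 0.0
    for v, w in items:
        if capacity <= 0:
            break
        take = min(w, capacity)  # whole item if it fits, otherwise the fraction that does
        total += v * take / w
        capacity -= take
    return total

# usage: print(greedy_knapsack(capacity, weights, values))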

Memory Profiler giving constant memory in all steps

I want to get the change in memory for every step in my function.
I have written the code for interpolation search, and even when given an input of as many as 10,000 elements in a list, there is still no change in memory.
The code is:
import time
from memory_profiler import profile

@profile()
def interpolation_search(numbers, value):
    low = 0
    high = len(numbers) - 1
    mid = 0
    while numbers[low] <= value and numbers[high] >= value:
        mid = low + ((value - numbers[low]) * (high - low)) / (numbers[high] - numbers[low])
        if numbers[mid] < value:
            low = mid + 1
        elif numbers[mid] > value:
            high = mid - 1
        else:
            return mid
    if numbers[low] == value:
        return low
    else:
        return -1

if __name__ == "__main__":
    # Pre-sorted numbers
    numbers = [-100, -6, 0, 1, 5, 14, 15, 26, 28, 29, 30, 31, 35, 37, 39, 40, 41, 42]
    num = []
    for i in range(100000):
        num.append(i)
    value = 15
    # Print numbers to search
    print 'Numbers:'
    print ' '.join([str(i) for i in numbers])
    # Find the index of 'value'
    start_time1 = time.time()
    index = interpolation_search(numbers, value)
    # Print the index where 'value' is located
    print '\nNumber %d is at index %d' % (value, index)
    print("--- Run Time %s seconds---" % (time.time() - start_time1))
The output that I am getting is:
Numbers:
-100 -6 0 1 5 14 15 26 28 29 30 31 35 37 39 40 41 42
Filename: C:/Users/Admin/PycharmProjects/timenspace/Interpolation.py
Line # Mem usage Increment Line Contents
================================================
4 21.5 MiB 0.0 MiB @profile()
5 def interpolation_search(numbers, value):
6 21.5 MiB 0.0 MiB low = 0
7 21.5 MiB 0.0 MiB high = len(numbers) - 1
8 21.5 MiB 0.0 MiB mid = 0
9
10 21.5 MiB 0.0 MiB while numbers[low] <= value and numbers[high] >= value:
11 21.5 MiB 0.0 MiB mid = low + ((value - numbers[low]) * (high - low)) / (numbers[high] - numbers[low])
12
13 21.5 MiB 0.0 MiB if numbers[mid] < value:
14 low = mid + 1
15
16 21.5 MiB 0.0 MiB elif numbers[mid] > value:
17 21.5 MiB 0.0 MiB high = mid - 1
18 else:
19 21.5 MiB 0.0 MiB return mid
20
21 if numbers[low] == value:
22 return low
23 else:
24 return -1
Number 15 is at index 6
--- Run Time 0.0429999828339 seconds---
As you can see, my memory remains constant at 21.5 MiB in all steps.
Please help me with this. Thank you.
Why do you expect it to increase? I don't see any memory allocations, i.e., the array numbers does not grow in size.
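(To see a nonzero increment, the allocation has to happen inside the profiled function. A minimal sketch, assuming memory_profiler is installed:)

from memory_profiler import profile

@profile
def allocate_inside(n):
    # This list is created inside the profiled function, so this line
    # shows up with a positive increment in the profiler output.
    data = [0] * n
    return len(data)

allocate_inside(10000000)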

Unsure why program similar to bubble-sort is not working

I have been working on a programming challenge, problem here, which basically states:
Given integer array, you are to iterate through all pairs of neighbor
elements, starting from beginning - and swap members of each pair
where first element is greater than second.
And then return the number of swaps made and the checksum of the final answer. My program seemingly does both the sorting and the checksum, but my final answer is off for everything except the test input they gave.
So: 1 4 3 2 6 5 -1
Results in the correct output: 3 5242536 with my program.
But something like:
2 96 7439 92999 240 70748 3 842 74 706 4 86 7 463 1871 7963 904 327 6268 20955 92662 278 57 8 5912 724 70916 13 388 1 697 99666 6924 2 100 186 37504 1 27631 59556 33041 87 9 45276 -1
Results in: 39 1291223 when the correct answer is 39 3485793.
Here's what I have at the moment:
# Python 2.7
def check_sum(data):
    data = [str(x) for x in str(data)[::]]
    numbers = len(data)
    result = 0
    for number in range(numbers):
        result += int(data[number])
        result *= 113
        result %= 10000007
    return(str(result))

def bubble_in_array(data):
    numbers = data[:-1]
    numbers = [int(x) for x in numbers]
    swap_count = 0
    for x in range(len(numbers)-1):
        if numbers[x] > numbers[x+1]:
            temp = numbers[x+1]
            numbers[x+1] = numbers[x]
            numbers[x] = temp
            swap_count += 1
    raw_number = int(''.join([str(x) for x in numbers]))
    print('%s %s') % (str(swap_count), check_sum(raw_number))

bubble_in_array(raw_input().split())
Does anyone have any idea where I am going wrong?
The issue is with your way of calculating the checksum. It fails when the array has numbers with more than one digit. For example:
2 96 7439 92999 240 70748 3 842 74 706 4 86 7 463 1871 7963 904 327 6268 20955 92662 278 57 8 5912 724 70916 13 388 1 697 99666 6924 2 100 186 37504 1 27631 59556 33041 87 9 45276 -1
You are calculating the checksum of 2967439240707483842747064867463187179639043276268209559266227857859127247091613388169792999692421001863750412763159556330418794527699666
digit by digit, while you should calculate the checksum of [2, 96, 7439, 240, 70748, 3, 842, 74, 706, 4, 86, 7, 463, 1871, 7963, 904, 327, 6268, 20955, 92662, 278, 57, 8, 5912, 724, 70916, 13, 388, 1, 697, 92999, 6924, 2, 100, 186, 37504, 1, 27631, 59556, 33041, 87, 9, 45276, 99666]
The fix:
# Python 2.7
def check_sum(data):
    result = 0
    for number in data:
        result += number
        result *= 113
        result %= 10000007
    return(result)

def bubble_in_array(data):
    numbers = [int(x) for x in data[:-1]]
    swap_count = 0
    for x in xrange(len(numbers)-1):
        if numbers[x] > numbers[x+1]:
            numbers[x+1], numbers[x] = numbers[x], numbers[x+1]
            swap_count += 1
    print('%d %d') % (swap_count, check_sum(numbers))

bubble_in_array(raw_input().split())
More notes:
To swap two variables in Python, you don't need a temp variable; just use a, b = b, a.
In Python 2.x, use xrange instead of range.
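To see why the two checksums diverge as soon as the numbers have more than one digit, compare them on a tiny made-up example:

# Python 2.7 - illustrative only: same checksum routine, different tokenization
def check_sum(seq):
    result = 0
    for number in seq:
        result += number
        result *= 113
        result %= 10000007
    return result

print check_sum([96, 7])    # element-wise: 96 stays a single number
print check_sum([9, 6, 7])  # digit-wise: 9, 6, 7 fed in separately; different result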

Python selecting items by comparing values in a table using dictionary

I have a table with 12 columns and want to select the items in the first column (qseqid) based on the second column (sseqid). The second column (sseqid) repeats with different values in the 11th and 12th columns, which are evalue and bitscore, respectively.
The ones that I would like to get are those having the lowest evalue and the highest bitscore (when evalues are the same, the rest of the columns can be ignored; the data is down below).
So, I have made a short piece of code which uses the second column as a key for the dictionary. I can get five different items from the second column with lists of qseqid+evalue and qseqid+bitscore.
Here is the code:
#!usr/bin/python
filename = "data.txt"
readfile = open(filename, "r")
d = dict()
for i in readfile.readlines():
    i = i.strip()
    i = i.split("\t")
    d.setdefault(i[1], []).append([i[0], i[10]])
    d.setdefault(i[1], []).append([i[0], i[11]])
for x in d:
    print(x, d[x])
readfile.close()
But, I am struggling to get the qseqid with the lowest evalue and the highest bitscore for each sseqid.
Is there any good logic to solve the problem?
The data.txt file (including the header row and with » representing tab characters):
qseqid»sseqid»pident»length»mismatch»gapopen»qstart»qend»sstart»send»evalue»bitscore
ACLA_022040»TBB»32.71»431»258»8»39»468»24»423»2.00E-76»240
ACLA_024600»TBB»80»435»87»0»1»435»1»435»0»729
ACLA_031860»TBB»39.74»453»251»3»1»447»1»437»1.00E-121»357
ACLA_046030»TBB»75.81»434»105»0»1»434»1»434»0»704
ACLA_072490»TBB»41.7»446»245»3»4»447»3»435»2.00E-120»353
ACLA_010400»EF1A»27.31»249»127»8»69»286»9»234»3.00E-13»61.6
ACLA_015630»EF1A»22»491»255»17»186»602»3»439»8.00E-19»78.2
ACLA_016510»EF1A»26.23»122»61»4»21»127»9»116»2.00E-08»46.2
ACLA_023300»EF1A»29.31»447»249»12»48»437»3»439»2.00E-45»155
ACLA_028450»EF1A»85.55»443»63»1»1»443»1»442»0»801
ACLA_074730»CALM»23.13»147»101»4»6»143»2»145»7.00E-08»41.2
ACLA_096170»CALM»29.33»150»96»4»34»179»2»145»1.00E-13»55.1
ACLA_016630»CALM»23.9»159»106»5»58»216»4»147»5.00E-12»51.2
ACLA_031930»RPB2»36.87»1226»633»24»121»1237»26»1219»0»734
ACLA_065630»RPB2»65.79»1257»386»14»1»1252»4»1221»0»1691
ACLA_082370»RPB2»27.69»1228»667»37»31»1132»35»1167»7.00E-110»365
ACLA_061960»ACT»28.57»147»95»5»146»284»69»213»3.00E-12»57.4
ACLA_068200»ACT»28.73»463»231»13»16»471»4»374»1.00E-53»176
ACLA_069960»ACT»24.11»141»97»4»581»718»242»375»9.00E-09»46.2
ACLA_095800»ACT»91.73»375»31»0»1»375»1»375»0»732
And here's a little more readable version of the table's contents:
0 1 2 3 4 5 6 7 8 9 10 11
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
ACLA_022040 TBB 32.71 431 258 8 39 468 24 423 2.00E-76 240
ACLA_024600 TBB 80 435 87 0 1 435 1 435 0 729
ACLA_031860 TBB 39.74 453 251 3 1 447 1 437 1.00E-121 357
ACLA_046030 TBB 75.81 434 105 0 1 434 1 434 0 704
ACLA_072490 TBB 41.7 446 245 3 4 447 3 435 2.00E-120 353
ACLA_010400 EF1A 27.31 249 127 8 69 286 9 234 3.00E-13 61.6
ACLA_015630 EF1A 22 491 255 17 186 602 3 439 8.00E-19 78.2
ACLA_016510 EF1A 26.23 122 61 4 21 127 9 116 2.00E-08 46.2
ACLA_023300 EF1A 29.31 447 249 12 48 437 3 439 2.00E-45 155
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0 801
ACLA_074730 CALM 23.13 147 101 4 6 143 2 145 7.00E-08 41.2
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1.00E-13 55.1
ACLA_016630 CALM 23.9 159 106 5 58 216 4 147 5.00E-12 51.2
ACLA_031930 RPB2 36.87 1226 633 24 121 1237 26 1219 0 734
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0 1691
ACLA_082370 RPB2 27.69 1228 667 37 31 1132 35 1167 7.00E-110 365
ACLA_061960 ACT 28.57 147 95 5 146 284 69 213 3.00E-12 57.4
ACLA_068200 ACT 28.73 463 231 13 16 471 4 374 1.00E-53 176
ACLA_069960 ACT 24.11 141 97 4 581 718 242 375 9.00E-09 46.2
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0 732
Since you're a Python newbie I'm glad that there are several examples of how to do this manually, but for comparison I'll show how it can be done using the pandas library, which makes working with tabular data much simpler.
Since you didn't provide example output, I'm assuming that by "with the lowest evalue and the highest bitscore for each sseqid" you mean "the highest bitscore among the lowest evalues" for a given sseqid; if you want those separately, that's trivial too (see the sketch after the explanation below).
import pandas as pd

df = pd.read_csv("acla1.dat", sep="\t")
# sort_values replaces the old DataFrame.sort, which newer pandas versions removed
df = df.sort_values(["evalue", "bitscore"], ascending=[True, False])
df_new = df.groupby("sseqid", as_index=False).first()
which produces
>>> df_new
sseqid qseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
0 ACT ACLA_095800 91.73 375 31 0 1 375 1 375 0.000000e+00 732.0
1 CALM ACLA_096170 29.33 150 96 4 34 179 2 145 1.000000e-13 55.1
2 EF1A ACLA_028450 85.55 443 63 1 1 443 1 442 0.000000e+00 801.0
3 RPB2 ACLA_065630 65.79 1257 386 14 1 1252 4 1221 0.000000e+00 1691.0
4 TBB ACLA_024600 80.00 435 87 0 1 435 1 435 0.000000e+00 729.0
Basically, first we read the data file into an object called a DataFrame, which is kind of like an Excel worksheet. Then we sort by evalue ascending (so that lower evalues come first) and by bitscore descending (so that higher bitscores come first). Then we can use groupby to collect the data in groups of equal sseqid, and take the first one in each group, which because of the sorting will be the one we want.
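And if you do want the lowest evalue and the highest bitscore taken independently of each other, per sseqid and possibly from different rows, that's the trivial case mentioned above; a sketch:

import pandas as pd

df = pd.read_csv("acla1.dat", sep="\t")
# Per-sseqid extremes taken independently, so they may come from different rows.
extremes = df.groupby("sseqid").agg({"evalue": "min", "bitscore": "max"})
print(extremes)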
#!usr/bin/python
import csv

DATA = "data.txt"

class Sequence:
    def __init__(self, row):
        self.qseqid = row[0]
        self.sseqid = row[1]
        self.pident = float(row[2])
        self.length = int(row[3])
        self.mismatch = int(row[4])
        self.gapopen = int(row[5])
        self.qstart = int(row[6])
        self.qend = int(row[7])
        self.sstart = int(row[8])
        self.send = int(row[9])
        self.evalue = float(row[10])
        self.bitscore = float(row[11])

    def __str__(self):
        return (
            "{qseqid}\t"
            "{sseqid}\t"
            "{pident}\t"
            "{length}\t"
            "{mismatch}\t"
            "{gapopen}\t"
            "{qstart}\t"
            "{qend}\t"
            "{sstart}\t"
            "{send}\t"
            "{evalue}\t"
            "{bitscore}"
        ).format(**self.__dict__)

def entries(fname, header_rows=1, dtype=list, **kwargs):
    with open(fname) as inf:
        incsv = csv.reader(inf, **kwargs)
        # skip header rows
        for i in range(header_rows):
            next(incsv)
        for row in incsv:
            yield dtype(row)

def main():
    bestseq = {}
    for seq in entries(DATA, dtype=Sequence, delimiter="\t"):
        # see if a sequence with the same sseqid already exists
        prev = bestseq.get(seq.sseqid, None)
        if (
            prev is None
            or seq.evalue < prev.evalue
            or (seq.evalue == prev.evalue and seq.bitscore > prev.bitscore)
        ):
            bestseq[seq.sseqid] = seq
    # display selected sequences
    keys = sorted(bestseq)
    for key in keys:
        print(bestseq[key])

if __name__ == "__main__":
    main()
which results in
ACLA_095800 ACT 91.73 375 31 0 1 375 1 375 0.0 732.0
ACLA_096170 CALM 29.33 150 96 4 34 179 2 145 1e-13 55.1
ACLA_028450 EF1A 85.55 443 63 1 1 443 1 442 0.0 801.0
ACLA_065630 RPB2 65.79 1257 386 14 1 1252 4 1221 0.0 1691.0
ACLA_024600 TBB 80.0 435 87 0 1 435 1 435 0.0 729.0
While not nearly as elegant and concise as using the pandas library, it's quite possible to do what you want without resorting to third-party modules. The following uses the collections.defaultdict class to facilitate creation of dictionaries of variable-length lists of records. The use of the AttrDict class is optional, but it makes accessing the fields of each dictionary-based record easier and less awkward-looking than the usual dict['fieldname'] syntax otherwise required.
import csv
from collections import defaultdict, namedtuple
from itertools import imap
from operator import itemgetter

data_file_name = 'data.txt'
DELIMITER = '\t'
ssqeid_dict = defaultdict(list)

# from http://stackoverflow.com/a/1144405/355230
def multikeysort(items, columns):
    comparers = [((itemgetter(col[1:].strip()), -1) if col.startswith('-') else
                  (itemgetter(col.strip()), 1)) for col in columns]
    def comparer(left, right):
        for fn, mult in comparers:
            result = cmp(fn(left), fn(right))
            if result:
                return mult * result
        else:
            return 0
    return sorted(items, cmp=comparer)

# from http://stackoverflow.com/a/15109345/355230
class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

with open(data_file_name, 'rb') as data_file:
    reader = csv.DictReader(data_file, delimiter=DELIMITER)
    format_spec = '\t'.join([('{%s}' % field) for field in reader.fieldnames])
    for rec in (AttrDict(r) for r in reader):
        # Convert the two sort fields to numeric values for proper ordering.
        rec.evalue, rec.bitscore = map(float, (rec.evalue, rec.bitscore))
        ssqeid_dict[rec.sseqid].append(rec)

for ssqeid in sorted(ssqeid_dict):
    # Sort each group of recs with same ssqeid. The first record after sorting
    # will be the one sought that has the lowest evalue and highest bitscore.
    selected = multikeysort(ssqeid_dict[ssqeid], ['evalue', '-bitscore'])[0]
    print format_spec.format(**selected)
Output (» represents tabs):
ACLA_095800» ACT» 91.73» 375» 31» 0» 1» 375» 1» 375» 0.0» 732.0
ACLA_096170» CALM» 29.33» 150» 96» 4» 34» 179» 2» 145» 1e-13» 55.1
ACLA_028450» EF1A» 85.55» 443» 63» 1» 1» 443» 1» 442» 0.0» 801.0
ACLA_065630» RPB2» 65.79» 1257» 386» 14» 1» 1252» 4» 1221» 0.0» 1691.0
ACLA_024600» TBB» 80» 435» 87» 0» 1» 435» 1» 435» 0.0» 729.0
filename = 'data.txt'
readfile = open(filename, 'r')
d = dict()
sseqid = []
lines = []
for i in readfile.readlines():
    sseqid.append(i.rsplit()[1])
    lines.append(i.rsplit())
sorted_sseqid = sorted(set(sseqid))

sdqDict = {}
key = None
for sorted_ssqd in sorted_sseqid:
    key = sorted_ssqd
    evalue = []
    bitscore = []
    qseid = []
    for line in lines:
        if key in line:
            evalue.append(line[10])
            bitscore.append(line[11])
            qseid.append(line[0])
    sdqDict[key] = [qseid, evalue, bitscore]

print sdqDict
# note: the evalue entries are still strings at this point, so min() compares
# them lexicographically rather than numerically
print 'TBB LOWEST EVALUE' + '---->' + min(sdqDict['TBB'][1])
## I think you can do the list manipulation below to find out the qseqid
readfile.close()

Index Error when using python to read BLAST output in csv format

Apologies for the long question, I have been trying to solve this bug but I can't work out what I'm doing wrong! I have included an example of the data so you can see what I'm working with.
I have data output from a BLAST search as below:
# BLASTN 2.2.29+
# Query: Cryptocephalus androgyne
# Database: SANfive
# Fields: query id subject id % identity alignment length mismatches gap opens q. start q. end s. start s. end evalue bit score
# 7 hits found
Cryptocephalus M00964:19:000000000-A4YV1:1:2110:23842:21326 99.6 250 1 0 125 374 250 1 1.00E-128 457
Cryptocephalus M00964:19:000000000-A4YV1:1:1112:19704:18005 85.37 246 36 0 90 335 246 1 4.00E-68 255
Cryptocephalus M00964:19:000000000-A4YV1:1:2106:14369:15227 77.42 248 50 3 200 444 245 1 3.00E-34 143
Cryptocephalus M00964:19:000000000-A4YV1:1:2102:5533:11928 78.1 137 30 0 3 139 114 250 2.00E-17 87.9
Cryptocephalus M00964:19:000000000-A4YV1:1:1110:28729:12868 81.55 103 19 0 38 140 104 2 6.00E-17 86.1
Cryptocephalus M00964:19:000000000-A4YV1:1:1113:11427:16440 78.74 127 27 0 3 129 124 250 6.00E-17 86.1
Cryptocephalus M00964:19:000000000-A4YV1:1:2110:12170:20594 78.26 115 25 0 3 117 102 216 1.00E-13 75
# BLASTN 2.2.29+
# Query: Cryptocephalus aureolus
# Database: SANfive
# Fields: query id subject id % identity alignment length mismatches gap opens q. start q. end s. start s. end evalue bit score
# 10 hits found
Cryptocephalus M00964:19:000000000-A4YV1:1:2111:20990:19930 97.2 250 7 0 119 368 250 1 1.00E-118 424
Cryptocephalus M00964:19:000000000-A4YV1:1:1105:20676:23942 86.89 206 27 0 5 210 209 4 7.00E-61 231
Cryptocephalus M00964:19:000000000-A4YV1:1:1113:6534:23125 97.74 133 3 0 1 133 133 1 3.00E-60 230
Cryptocephalus M00964:21:000000000-A4WJV:1:2104:11955:19015 89.58 144 15 0 512 655 1 144 2.00E-46 183
Cryptocephalus M00964:21:000000000-A4WJV:1:1109:14814:10240 88.28 128 15 0 83 210 11 138 2.00E-37 154
Cryptocephalus M00964:21:000000000-A4WJV:1:1105:4530:13833 79.81 208 42 0 3 210 211 4 6.00E-37 152
Cryptocephalus M00964:19:000000000-A4YV1:1:2108:13133:14967 98.7 77 1 0 1 77 77 1 2.00E-32 137
Cryptocephalus M00964:19:000000000-A4YV1:1:1109:14328:3682 100 60 0 0 596 655 251 192 1.00E-24 111
Cryptocephalus M00964:19:000000000-A4YV1:1:1105:19070:25181 100 53 0 0 1 53 53 1 8.00E-21 99
Cryptocephalus M00964:19:000000000-A4YV1:1:1109:20848:27419 100 28 0 0 1 28 28 1 6.00E-07 52.8
# BLASTN 2.2.29+
# Query: Cryptocephalus cynarae
# Database: SANfive
# Fields: query id subject id % identity alignment length mismatches gap opens q. start q. end s. start s. end evalue bit score
# 2 hits found
Cryptocephalus M00964:21:000000000-A4WJV:1:2107:12228:15885 90.86 175 16 0 418 592 4 178 5.00E-62 235
Cryptocephalus M00964:21:000000000-A4WJV:1:1110:20463:5044 84.52 168 26 0 110 277 191 24 2.00E-41 167
and I have saved this as a csv, again shown below
# BLASTN 2.2.29+,,,,,,,,,,,
# Query: Cryptocephalus androgyne,,,,,,,,,,,
# Database: SANfive,,,,,,,,,,,
# Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
# 7 hits found,,,,,,,,,,,
Cryptocephalus,M00964:19:000000000-A4YV1:1:2110:23842:21326,99.6,250,1,0,125,374,250,1,1.00E-128,457
Cryptocephalus,M00964:19:000000000-A4YV1:1:1112:19704:18005,85.37,246,36,0,90,335,246,1,4.00E-68,255
Cryptocephalus,M00964:19:000000000-A4YV1:1:2106:14369:15227,77.42,248,50,3,200,444,245,1,3.00E-34,143
Cryptocephalus,M00964:19:000000000-A4YV1:1:2102:5533:11928,78.1,137,30,0,3,139,114,250,2.00E-17,87.9
Cryptocephalus,M00964:19:000000000-A4YV1:1:1110:28729:12868,81.55,103,19,0,38,140,104,2,6.00E-17,86.1
Cryptocephalus,M00964:19:000000000-A4YV1:1:1113:11427:16440,78.74,127,27,0,3,129,124,250,6.00E-17,86.1
Cryptocephalus,M00964:19:000000000-A4YV1:1:2110:12170:20594,78.26,115,25,0,3,117,102,216,1.00E-13,75
# BLASTN 2.2.29+,,,,,,,,,,,
# Query: Cryptocephalus aureolus,,,,,,,,,,,
# Database: SANfive,,,,,,,,,,,
# Fields: query id, subject id, % identity, alignment length, mismatches, gap opens, q. start, q. end, s. start, s. end, evalue, bit score
# 10 hits found,,,,,,,,,,,
Cryptocephalus,M00964:19:000000000-A4YV1:1:2111:20990:19930,97.2,250,7,0,119,368,250,1,1.00E-118,424
Cryptocephalus,M00964:19:000000000-A4YV1:1:1105:20676:23942,86.89,206,27,0,5,210,209,4,7.00E-61,231
Cryptocephalus,M00964:19:000000000-A4YV1:1:1113:6534:23125,97.74,133,3,0,1,133,133,1,3.00E-60,230
Cryptocephalus,M00964:21:000000000-A4WJV:1:2104:11955:19015,89.58,144,15,0,512,655,1,144,2.00E-46,183
Cryptocephalus,M00964:21:000000000-A4WJV:1:1109:14814:10240,88.28,128,15,0,83,210,11,138,2.00E-37,154
Cryptocephalus,M00964:21:000000000-A4WJV:1:1105:4530:13833,79.81,208,42,0,3,210,211,4,6.00E-37,152
Cryptocephalus,M00964:19:000000000-A4YV1:1:2108:13133:14967,98.7,77,1,0,1,77,77,1,2.00E-32,137
Cryptocephalus,M00964:19:000000000-A4YV1:1:1109:14328:3682,100,60,0,0,596,655,251,192,1.00E-24,111
Cryptocephalus,M00964:19:000000000-A4YV1:1:1105:19070:25181,100,53,0,0,1,53,53,1,8.00E-21,99
Cryptocephalus,M00964:19:000000000-A4YV1:1:1109:20848:27419,100,28,0,0,1,28,28,1,6.00E-07,52.8
I have designed a short script that goes through the percentage identity and, depending on which threshold bracket it falls into, adds the query ID to the corresponding list before removing duplicates from that list.
import csv
from pylab import plot, show

# Making a function to see if a string is a number or not
def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# Importing the CSV file, using sniffer to check the delimiters used
# in the first 1024 bytes
ImportFile = raw_input("What is the name of your import file? ")
csvfile = open(ImportFile, "rU")
dialect = csv.Sniffer().sniff(csvfile.read(1024))
csvfile.seek(0)
reader = csv.reader(csvfile, dialect)

# Finding species over 98%
Species98 = []
Species95to97 = []
Species90to94 = []
Species85to89 = []
Species80to84 = []
Species75to79 = []
SpeciesBelow74 = []
for line in reader:
    if is_number(line[2]) == True:
        if float(line[2]) >= 98:
            Species98.append(line[0])
        elif 97 >= float(line[2]) >= 95:
            Species95to97.append(line[0])
        elif 94 >= float(line[2]) >= 90:
            Species90to94.append(line[0])
        elif 89 >= float(line[2]) >= 85:
            Species85to89.append(line[0])
        elif 84 >= float(line[2]) >= 80:
            Species80to84.append(line[0])
        elif 79 >= float(line[2]) >= 75:
            Species75to79.append(line[0])
        elif float(line[2]) <= 74:
            SpeciesBelow74.append(line[0])

def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if x not in seen and not seen_add(x)]

Species98 = f7(Species98)
print len(Species98), "species over 98"

Species95to97 = f7(Species95to97)  # removing duplicates
search_set = set().union(Species98)
Species95to97 = [x for x in Species95to97 if x not in search_set]
print len(Species95to97), "species between 95-97"

Species90to94 = f7(Species90to94)
search_set = set().union(Species98, Species95to97)
Species90to94 = [x for x in Species90to94 if x not in search_set]
print len(Species90to94), "species between 90-94"

Species85to89 = f7(Species85to89)
search_set = set().union(Species98, Species95to97, Species90to94)
Species85to89 = [x for x in Species85to89 if x not in search_set]
print len(Species85to89), "species between 85-89"

Species80to84 = f7(Species80to84)
search_set = set().union(Species98, Species95to97, Species90to94, Species85to89)
Species80to84 = [x for x in Species80to84 if x not in search_set]
print len(Species80to84), "species between 80-84"

Species75to79 = f7(Species75to79)
search_set = set().union(Species98, Species95to97, Species90to94, Species85to89, Species80to84)
Species75to79 = [x for x in Species75to79 if x not in search_set]
print len(Species75to79), "species between 75-79"

SpeciesBelow74 = f7(SpeciesBelow74)
search_set = set().union(Species98, Species95to97, Species90to94, Species85to89, Species80to84, Species75to79)
SpeciesBelow74 = [x for x in SpeciesBelow74 if x not in search_set]
print len(SpeciesBelow74), "species below 74"

#Finding species 95-97%
The script works perfectly most of the time, but every so often I get the error shown below:
File "FindingSpeciesRepresentation.py", line 35, in <module>
if is_number(line[2])== "True":
IndexError: list index out of range
But if I change the script so it prints line[2], it prints all the identities as I would expect. Do you have any idea what could be going wrong? Again, apologies for the wall of data.
This has been partly taken from my earlier question: Extracting BLAST output columns in CSV form with python
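A likely culprit, as an educated guess since the failing input isn't shown: a row with fewer than three fields, such as a blank line at the end of the file, makes line[2] raise an IndexError even though every row that prints looks fine. Guarding on the row length before indexing avoids it; a minimal Python 2 sketch:

import csv

def is_number(s):
    try:
        float(s)
        return True
    except ValueError:
        return False

# csv.reader turns an empty line into an empty list, so line[2] would blow up.
rows = csv.reader(['a,b,99.6', '', 'c,d,85.37'])
for line in rows:
    if len(line) > 2 and is_number(line[2]):  # length guard prevents the IndexError
        print line[0], line[2]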
