Finding items that occur exactly once in an array - python

I have a 2-dimensional array. Each of the row vectors, in this case, is considered a quantity of interest. What I want to do is return all the rows that appear exactly once as one array, and all the rows that appear more than once as a second array.
For example, if the array was:
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
I would like to return two arrays:
nonsingles=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [5,1,6,0]]
singles= [[3,2,1,0], [4,4,1,0]]
It is important that the order stay preserved. The code I have written to do this is as follows:
import numpy as np

def singles_nonsingles(array):
    # returns the elements that occur only once, and the elements
    # that occur more than once in the array
    singles = []
    nonsingles = []
    arrayhash = map(tuple, array)
    for x in arrayhash:
        if arrayhash.count(x) == 1:
            singles.append(x)
        if arrayhash.count(x) > 1:
            nonsingles.append(x)
    nonsingles = np.array(nonsingles)  # np.array, since the parameter name shadows numpy's array()
    singles = np.array(singles)
    return {'singles': singles, 'nonsingles': nonsingles}
Now, I am happy to say that this works, but unhappy to say that it is extremely slow: a typical array I have is 30000 rows x 10 elements/row = 300000 elements. Can anyone give me some tips on how to speed this up? I apologize if this question is very simple; I am new to Python. Also, I am using NumPy/SciPy with Python 2.7, if that is any help.

In Python 2.7 or above, you can use collections.Counter to count the number of occurrences:
import collections

def unique_items(iterable):
    tuples = list(map(tuple, iterable))  # list() so the sequence can be traversed twice
    counts = collections.Counter(tuples)
    unique = []
    non_unique = []
    for t in tuples:
        if counts[t] == 1:
            unique.append(t)
        else:
            non_unique.append(t)
    return unique, non_unique
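For example, with the sample array from the question (rows come back as tuples; wrap the results in np.array if you need arrays):
a = [[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
unique, non_unique = unique_items(a)
# unique     -> [(3, 2, 1, 0), (4, 4, 1, 0)]
# non_unique -> [(1, 1, 1, 0), (1, 1, 1, 0), (5, 1, 6, 0), (5, 1, 6, 0)]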

I think your problem is that you are calling count() on a list inside the loop. Each call is O(n), so the whole thing is O(n^2).
It should be faster to build a dict of counts once and then use that to figure out what to do with each row.
EDIT: The code had an unnecessary enumerate() in it; I stripped it out.
from collections import defaultdict

def singles_nonsingles(array):
    # returns the elements that occur only once, and the elements
    # that occur more than once in the array
    singles = []
    nonsingles = []
    d = defaultdict(int)
    t = [tuple(row) for row in array]
    for row in t:
        d[row] += 1
    for row in t:
        if d[row] == 1:
            singles.append(row)
        else:
            nonsingles.append(row)
    return {'singles': singles, 'nonsingles': nonsingles}
Here's a version that lists each distinct row only once:
from collections import defaultdict

def singles_nonsingles(array):
    # returns the elements that occur only once, and the elements
    # that occur more than once in the array
    singles = []
    nonsingles = []
    d = defaultdict(int)
    already_seen = set()
    t = [tuple(row) for row in array]
    for row in t:
        d[row] += 1
    for row in t:
        if row in already_seen:
            continue
        if d[row] == 1:
            singles.append(row)
        else:
            nonsingles.append(row)
        already_seen.add(row)
    return {'singles': singles, 'nonsingles': nonsingles}
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
x = singles_nonsingles(a)
print("Array: " + str(a))
print(x)

The first function returns the single/non-single rows without repetitions; the second keeps the repetitions:
from collections import defaultdict

def comp(multi):
    res = defaultdict(int)
    for vect in multi:
        res[tuple(vect)] += 1
    singles = []
    no_singles = []
    for k in res:
        if res[k] > 1:
            no_singles.append(list(k))
        elif res[k] == 1:
            singles.append(list(k))
    return singles, no_singles

def count_w_repetitions(multi):
    res = defaultdict(int)
    for vect in multi:
        res[tuple(vect)] += 1
    singles = []
    no_singles = []
    for k in res:
        if res[k] == 1:
            singles.append(list(k))
        else:
            for i in xrange(res[k]):
                no_singles.append(list(k))
    return singles, no_singles
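For example, with the question's sample array (note both functions iterate over the dict, so the original row order is not guaranteed, which matters for the question's ordering requirement):
a = [[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
singles, no_singles = comp(a)
# singles e.g. [[3, 2, 1, 0], [4, 4, 1, 0]], no_singles e.g. [[1, 1, 1, 0], [5, 1, 6, 0]]
singles, no_singles = count_w_repetitions(a)
# no_singles now keeps duplicates, e.g. [[1, 1, 1, 0], [1, 1, 1, 0], [5, 1, 6, 0], [5, 1, 6, 0]]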

from itertools import compress, imap
from collections import Counter

counts = Counter(imap(tuple, a))
def occurs_once(row):
    return counts[tuple(row)] == 1
uniq = map(occurs_once, a)
singles = list(compress(a, uniq))
notuniq = imap(lambda x: not x, uniq)
nonsingles = list(compress(a, notuniq))
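A quick check, assuming a is the question's sample list, shows order is preserved:
print singles     # [[3, 2, 1, 0], [4, 4, 1, 0]]
print nonsingles  # [[1, 1, 1, 0], [1, 1, 1, 0], [5, 1, 6, 0], [5, 1, 6, 0]]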

Related

How to link lists in order

I have multiple lists where the first index of each list is related, the second as well, and so on and so forth. I need a way of linking the order of these lists together. I have a list of teams (some are duplicates), and I need an if statement that says: if there's a duplicate of this team, compare it to the duplicate, take the related value in the other list, and choose the better one.
import sys
import itertools
from itertools import islice

fileLocation = input("Input the file location of ScoreBoard: ")
T = []
N = []
L = []
timestamps = []
teamids = []
problemids = []
inputids = []
scores = []
dictionary = {}
amountOfLines = len(open('input1.txt').readlines())
with open('input1.txt') as input1:
    for line in islice(input1, 2, amountOfLines):
        parsed = line.strip().split()
        timestamps.append(parsed[0])
        teamids.append(parsed[1])
        problemids.append(parsed[2])
        inputids.append(parsed[3])
        scores.append(parsed[4])

def checkIfDuplicates(teamids):
    ''' Check if given list contains any duplicates '''
    if len(teamids) == len(set(teamids)):
        return False
    else:
        return True

for i in teamids:
    if checkIfDuplicates(i):
        dictionary['team%s' % i] = {}
# The fragment below is incomplete as posted (it does not parse):
#     if dictionary < amountOfTeams:
#         dictionary['team%s' %]
#     for i in score:
#         dictionary[teamid][]
print(dictionary)
Loop through each list item and delete the item if it is a duplicate:
for i in list1:
    for k in list2[:]:  # iterate over a copy so removing while looping is safe
        if i == k:
            list2.remove(k)
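For instance, assuming two small example lists:
list1 = [1, 2, 3]
list2 = [2, 4, 1]
# after the loops above: list2 == [4]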

How to speed up combination algorithm?

The code below finds the minimum number of items of list B that together form string A. Let's say A='hello world how are you doing' and B=['hello world how', 'hello are', 'hello', 'hello are you doing']. Since the items with indices 0 and 3 contain all the words of string A, the answer is 2.
I converted all the strings to integers to speed up the algorithm, but since there are larger and more complicated test cases I need a more optimized algorithm. I'm wondering how to speed this up.
import itertools

A = 'hello world how are you doing'
B = ['hello world how', 'hello are', 'hello', 'hello are you doing']
d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()]
mapping = dict(zip(A.split(), range(1, len(A) + 1)))
# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)
solved = False
for L in range(0, len(res_B)+1):
    for subset in itertools.combinations(res_B, L):
        s = set(item for sublist in subset for item in sublist)
        if set_a.issubset(s):
            print(f'{L}')
            solved = True
            break
    if solved:
        break
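For context, this brute-force search is the minimum set cover problem, which is NP-hard, so the number of combinations grows exponentially. If an approximate answer is acceptable, the standard greedy heuristic (a sketch, not taken from the answer below) runs in polynomial time: repeatedly pick the set that covers the most still-uncovered words.
def greedy_cover(universe, sets):
    # universe: set of words to cover; sets: list of word sets
    uncovered = set(universe)
    chosen = 0
    while uncovered:
        best = max(sets, key=lambda s: len(s & uncovered))
        if not (best & uncovered):
            return None  # remaining words cannot be covered
        uncovered -= best
        chosen += 1
    return chosen

print(greedy_cover(set(A.split()), [set(s.split()) for s in B]))  # 2 for the example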
Edit: I had a logic mistake in remove_sub; no idea why it still worked.
Try cleaning the data first and removing as many items from b as you can:
import itertools as it
import time
import numpy as np
from collections import Counter, defaultdict as dd
import copy

A = 'hello world how are you doing'
B = ['hello world how', 'hello are', 'hello', 'hello are you doing']
d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()]
mapping = dict(zip(A.split(), range(1, len(A) + 1)))
# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)
# my additions work on a list of sets
for i in range(len(res_B)):
    res_B[i] = set(res_B[i])
# a is a list of numbers, b is a list of sets of numbers; we are trying to cover a using min items from b
a = np.random.randint(0, 50, size=30)
np_set_a = set(a)
b = []
for i in range(200):
    size = np.random.randint(0, 20)
    b.append(set(np.random.choice(a, size)))
# up to here: created a, b for a larger data test

def f1(set_a, b):
    solved = False
    for L in range(0, len(b)+1):
        for subset in it.combinations(b, L):
            s = set(item for sublist in subset for item in sublist)
            if set_a.issubset(s):
                print(f'{L}', '**************f1')
                solved = True
                break
        if solved:
            break

def rare(b):
    c = Counter()  # a dict where the key is a num and the value is how many times this num appears over all b sets
    items = dd(list)  # dict where the key is a num and the value is a list of indexes where this num exists in b
    for i in range(len(b)):
        c.update(b[i])
        for num in b[i]:
            items[num].append(i)
    rare = set()
    common = c.most_common()  # sorted list of (number, count) tuples
    # take all the numbers that appear only once in b; the items holding them must be in the
    # final combination, so you can remove them from b and their numbers from a (already covered)
    for i in range(1, len(common)-1):
        if common[-i][1] == 1:
            rare.add(common[-i][0])
            continue
        break
    rare_items = set()  # a set of all indexes that have a rare number in them
    for k in rare:
        rare_items.update(items[k])
    values_from_rare_items = set()  # a set of all the numbers in the items with the rare numbers
    for i in rare_items:
        values_from_rare_items.update(b[i])
    # remove from b all the items with rare numbers; they have to be in the final combination,
    # so you don't need to check them
    for i in reversed(sorted(rare_items)):
        b.pop(i)
    return values_from_rare_items, b, len(rare_items)

# check sets in b: if 2 are equal remove 1; if 1 is a subset of the other, remove it
def remove_sub(b):
    to_pop = set()
    t = copy.deepcopy(b)
    for i in range(len(b)):
        for j in range(len(t)):
            if i == j:
                continue
            if b[i] == t[j]:
                to_pop.add(i)
                continue
            if b[i].issubset(t[j]):
                to_pop.add(i)
            if t[j].issubset(b[i]):
                to_pop.add(j)
    for i in reversed(sorted(to_pop)):
        b.pop(i)
    return b

def f2(set_a, b):
    b1 = remove_sub(b)
    values_from_rare_items, b2, num_rare_items = rare(b1)
    a_without_rare = set_a - values_from_rare_items  # numbers covered by the rare items are already covered
    solved = False
    for L in range(0, len(b2)+1):
        for subset in it.combinations(b2, L):
            s = set(item for sublist in subset for item in sublist)
            if a_without_rare.issubset(s):
                length = L + num_rare_items
                print(f'{length}', '*********f2')
                solved = True
                break
        if solved:
            break

s = time.time()
f1(set_a, b)
print(time.time()-s, '********************f1')
s = time.time()
f2(set_a, b)
print(time.time()-s, '******************f2')
s = time.time()
f1(set_a, res_B)
print(time.time()-s, '********************f1')
s = time.time()
f2(set_a, res_B)
print(time.time()-s, '******************f2')
This is the output:
2 **************f1
0.16755199432373047 ********************f1 num_array
2 *********f2
0.09078240394592285 ******************f2 num_array
2 **************f1
0.0009989738464355469 ********************f1 your_data
2 *********f2
0.0009975433349609375 ******************f2 your_data
You can improve it further by taking all the items that appear only a few times and treating them as if they appear once. In rare cases this will not give the real minimum, but the time improvement is significant.
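A minimal sketch of that relaxation, assuming a hypothetical threshold parameter grafted onto the rare-number selection (not the answerer's exact code):
from collections import Counter

def select_rare(counter, threshold=2):
    # treat numbers appearing at most `threshold` times as "rare", forcing the sets
    # containing them into the combination; may overshoot the true minimum
    return {num for num, cnt in counter.most_common() if cnt <= threshold}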

Python custom comparator to sort a specific list

I have an input list like [1,2,2,1,6] and the task at hand is to sort it by frequency. I have solved this question and am getting the output [1,2,6].
But the caveat is that if two of the numbers have the same count, like count(1) == count(2), then the larger number must come first in the output array: since 2 > 1, the desired output is [2,1,6].
So for the input [1,1,2,2,3,3] the output should be [3,2,1]. The counts are the same, so they get sorted by their actual values.
This is what I did
input format:
the number of test cases
the list input
def fun(l):
    d = {}
    for i in l:
        if i in d:
            d[i] += 1
        else:
            d[i] = 1
    d1 = sorted(d, key=lambda k: d[k], reverse=True)
    return d1

try:
    test = int(input())
    ans = []
    while test:
        l = [int(x) for x in input().split()]
        ans.append(fun(l))
        test -= 1
    for i in ans:
        for j in i:
            print(j, end=" ")
        print()
except:
    pass
I think this can help you. I added a reverse parameter that defaults to True, because that gives the desired solution, but I marked in the code where you can change this.
Here is the code:
from collections import defaultdict  # a dictionary initialized with a default value

def fun(l, reverse=True):
    d = defaultdict(int)
    # Count occurrences
    for i in l:
        d[i] += 1
    # Create a dictionary where the keys are frequencies
    new_d = defaultdict(list)
    for key, value in d.items():
        new_d[value].append(key)
    # Get frequencies
    list_freq = list(new_d.keys())
    list_freq.sort(reverse=reverse)  # YOU CAN CHANGE THIS
    # Add numbers in decreasing order of frequency
    # If two integers have the same frequency, the greater number goes first
    ordered_list = []
    for number in list_freq:
        values_number = new_d[number]
        values_number.sort(reverse=reverse)  # YOU CAN CHANGE THIS
        ordered_list.extend(values_number)
    return ordered_list
Examples:
l = [1,2,2,1,6]
fun(l)
#Output [2,1,6]
I hope this can help you!
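For reference, the same ordering can also be written with a single sort key (a minimal sketch using Counter: sort by descending count, then by descending value):
from collections import Counter

def fun_short(l):
    counts = Counter(l)
    return sorted(counts, key=lambda x: (-counts[x], -x))

fun_short([1, 2, 2, 1, 6])     # [2, 1, 6]
fun_short([1, 1, 2, 2, 3, 3])  # [3, 2, 1]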

Delete elements from list based on substring in Python

I have a huge list of strings where some strings differ in only two or three characters, like this:
ENSH-DFFEV1-5F
ENSH-DFFEV2-5F
ENSH-DFFEV3-5F
FVB.DFFVRV2-4T
FVB.DFFVRV3-4T
What I would like to do is to keep only those elements for which the number after the 'V' is the largest. From the above example I would like to have
ENSH-DFFEV3-5F
FVB.DFFVRV3-4T
Is there a simple way to do this in Python?
@stevieb is right, but anyway, I made the effort for you.
s = """
ENSH-DFFEV1-5F
ENSH-DFFEV2-5F
ENSH-DFFEV3-5F
FVB.DFFVRV2-4T
FVB.DFFVRV3-4T
""".split()
def custom_filter(s):
out = []
current_max = -1
for r in s:
v = int(r.rsplit('-', 1)[0][-1]) # <- you should probably edit this line to fit your data structure
if v > current_max:
current_max = v
out = []
if v == current_max:
out += [r]
return out
for e in custom_filter(s):
print e
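Note that custom_filter keeps the rows matching the single largest version across the whole list; that happens to give the desired output here because both prefixes peak at 3. If different prefixes can peak at different versions, a per-prefix variant might look like this (a sketch, assuming the same rsplit parsing):
from collections import OrderedDict

def max_per_prefix(rows):
    best = OrderedDict()  # prefix -> (version, row), keeps first-seen order
    for r in rows:
        head = r.rsplit('-', 1)[0]            # e.g. 'ENSH-DFFEV3'
        prefix, v = head[:-1], int(head[-1])  # split off the trailing version digit
        if prefix not in best or v > best[prefix][0]:
            best[prefix] = (v, r)
    return [row for _, row in best.values()]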

Create multiple dictionaries from a single iterator in nested for loops

I have a nested list comprehension which has created a list of six lists of ~29,000 items. I'm trying to parse this final list of data and create six separate dictionaries from it. Right now the code is very unpythonic; I need the right statement to properly accomplish the following:
1.) Create six dictionaries from a single statement.
2.) Scale to a list of any length, i.e., not hardcode a counter as shown.
I've run into multiple issues and have tried the following:
1.) Using while loops.
2.) Using break statements; these break out of the innermost loop, but then the other dictionaries are not created properly. I also tried break statements set by a binary switch.
3.) Using if/else conditions for n indices, where the indices iterate from 1 to 29,000 and then repeat.
Note the ellipses designate code omitted for brevity.
# Parse csv files for samples, creating a dictionary of key, value pairs and multiple lists.
with open('genes_1') as f:
    cread_1 = list(csv.reader(f, delimiter='\t'))
sample_1_values = [j for i, j in (sorted([x for x in {i: float(j)
                   for i, j in cread_1}.items()], key=lambda v: v[1]))]
sample_1_genes = [i for i, j in (sorted([x for x in {i: float(j)
                  for i, j in cread_1}.items()], key=lambda v: v[1]))]
...
# Compute row means.
mean_values = []
for i, (a, b, c, d, e, f) in enumerate(zip(sample_1_values, sample_2_values, sample_3_values, sample_4_values, sample_5_values, sample_6_values)):
    mean_values.append((a + b + c + d + e + f) / 6)
# Provide proper gene names for mean values and replace original data values by corresponding means.
sample_genes_list = [i for i in (sample_1_genes, sample_2_genes, sample_3_genes, sample_4_genes, sample_5_genes, sample_6_genes)]
sample_final_list = [sorted(zip(sg, mean_values)) for sg in sample_genes_list]
# Create multiple dictionaries from normalized values for each dataset.
class BreakIt(Exception): pass

try:
    count = 1
    for index, items in enumerate(sample_final_list):
        sample_1_dict_normalized = {}
        for index, (genes, values) in enumerate(items):
            sample_1_dict_normalized[genes] = values
            count = count + 1
            if count == 29595:
                raise BreakIt
except BreakIt:
    pass
...
try:
    count = 1
    for index, items in enumerate(sample_final_list):
        sample_6_dict_normalized = {}
        for index, (genes, values) in enumerate(items):
            if count > 147975:
                sample_6_dict_normalized[genes] = values
            count = count + 1
            if count == 177570:
                raise BreakIt
except BreakIt:
    pass
# Pull expression values to qualify overexpressed proteins.
print 'ERG values:'
print 'Sample 1:', round(sample_1_dict_normalized.get('ERG'), 3)
print 'Sample 6:', round(sample_6_dict_normalized.get('ERG'), 3)
Your code is too long for me to give an exact answer, so I will answer very generally.
First, you are using enumerate for no reason. If you don't need both the index and the value, you probably don't need enumerate.
This part:
with open('genes.csv') as f:
    cread_1 = list(csv.reader(f, delimiter='\t'))
sample_1_dict = {i: float(j) for i, j in cread_1}
sample_1_list = [x for x in sample_1_dict.items()]
sample_1_values_sorted = sorted(sample_1_list, key=lambda expvalues: expvalues[1])
sample_1_genes = [i for i, j in sample_1_values_sorted]
sample_1_values = [j for i, j in sample_1_values_sorted]
sample_1_graph_raw = [float(j) for i, j in cread_1]
should be (a) using a list named samples and (b) much shorter, since you don't really need to extract all this information from sample_1_dict and move it around right now. It can be something like:
samples = [None] * 6
for k in range(6):
    with open('genes.csv') as f:  # but something specific to k
        cread = list(csv.reader(f, delimiter='\t'))
        samples[k] = {i: float(j) for i, j in cread}
after that, calculating the sum and mean will be way more natural.
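For example, a sketch of the row means over that samples list (assuming every sample dict shares the same gene keys):
genes = samples[0].keys()
mean_values = {g: sum(s[g] for s in samples) / float(len(samples)) for g in genes}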
In this part:
class BreakIt(Exception): pass

try:
    count = 1
    for index, items in enumerate(sample_final_list):
        sample_1_dict_normalized = {}
        for index, (genes, values) in enumerate(items):
            sample_1_dict_normalized[genes] = values
            count = count + 1
            if count == 29595:
                raise BreakIt
except BreakIt:
    pass
you should be (a) iterating over the samples list mentioned earlier, and (b) not using count at all, since you can iterate naturally over samples or samples[k] or something like that.
Your code has several problems. You should put your code in functions that preferably do one thing each. Then you can call a function for each sample without repeating the same code six times (I assume that is what the ellipses are hiding). Give each function a self-describing name and a docstring that explains what it does. There is quite a bit of unnecessary code; some of this might become obvious once you have it in functions. Since functions take arguments, you can hand in your 29595, for example.
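A sketch of that structure (hypothetical names; adapt the slice bounds to your data):
import csv

def read_sample(path):
    """Read one sample file into a {gene: value} dict."""
    with open(path) as f:
        return {gene: float(value) for gene, value in csv.reader(f, delimiter='\t')}

def normalize_slice(final_list, start, stop):
    """Build one normalized dict from a slice of (gene, value) pairs."""
    return dict(final_list[start:stop])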
