How to speed up combination algorithm? - python

Code below finds minimum items of list B that forms string A. lets assume A='hello world how are you doing' and B=['hello world how', 'hello are' ,'hello', 'hello are you doing']. Then since items with index 0 and 3 contains all words of string A, the answer will be 2.
I converted all the strings to integer to speed up the algorithm, but since there are larger and complicated test cases I need more optimized algorithm. I wondering how to speed up this algorithm.
import itertools
A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']
d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()]
mapping = dict(zip(A.split(), range(1, len(A) + 1)))
# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)
solved = False
for L in range(0, len(res_B)+1):
for subset in itertools.combinations(res_B, L):
s = set(item for sublist in subset for item in sublist)
if set_a.issubset(s):
print(f'{L}')
solved = True
break
if solved: break

I Had a logic mistake on remove_sub, no idea why it still worked
try cleaning the data and reducing as much items from b
import itertools as it
import time
import numpy as np
from collections import Counter, defaultdict as dd
import copy
A='hello world how are you doing'
B=['hello world how', 'hello are' ,'hello', 'hello are you doing']
d = {}
res_A = [d.setdefault(word, len(d)+1) for word in A.lower().split()
mapping = dict(zip(A.split(), range(1, len(A) + 1)))
# find mappings of words in B
res_B = [[mapping[word] for word in s.split()] for s in B]
set_a = set(res_A)
# my adding works on list of sets
for i in range(len(res_B)):
res_B[i] = set(res_B[i])
# a is a list of numbers, b is a list of sets of numbers, we are trying to cover a using min items from b
a = np.random.randint(0,50,size = 30)
np_set_a = set(a)
b = []
for i in range(200):
size = np.random.randint(0,20)
b.append(set(np.random.choice(a,size)))
# till here, created a,b for larger data test
def f1(set_a, b):
solved = False
for L in range(0, len(b)+1):
for subset in it.combinations(b, L):
s = set(item for sublist in subset for item in sublist)
if set_a.issubset(s):
print(f'{L}','**************f1')
solved = True
break
if solved: break
def rare(b):
c = Counter() #a dict where the key is a num and the value is how many times this num appears on all b sets
items = dd(list) # dict where the key is num and value is list of index where this num exist in b
for i in range(len(b)):
c.update(b[i])
for num in b[i]:
items[num].append(i)
rare = set()
common = c.most_common() #return sorted list of tuples with a number and how many times it appear
for i in range(1,len(common)-1): #take all the numbers that appear only once on b, these items will have to be on the final combination so you can remove them from b and their numbers from a because those numbers are covered
if common[-i][1] ==1:
rare.add(common[0])
continue
break
rare_items = {} # a set of all index that have rare number in them
for k in rare:
rare_items.update(items[k])
values_from_rare_items = set() # a set of all the numbers in the items with the rare numbers
for i in rare_items:
values_from_rare_items.update(b[i])
for i in reversed(sorted(rare_items)): #remove from b all the items with rare numbers, because they have to be on the final combination, you dont need to check them
b.pop(i)
return values_from_rare_items,b, len(rare_items)
#check sets on b, if 2 are equal remove 1, if 1 is a subset of the other, remove it
def remove_sub(b):
to_pop = set()
t = copy.deepcopy(b)
for i in range(len(b)):
for j in range(len(t)):
if i ==j:
continue
if b[i] == t[j]:
to_pop.add(i)
continue
if b[i].issubset(t[j]):
to_pop.add(i)
if t[j].issubset(b[i]):
to_pop.add(j)
for i in reversed(sorted(to_pop)):
b.pop(i)
return b
def f2(set_a, b):
b1 = remove_sub(b)
values_from_rare_items,b2, num_rare_items = rare(b)
a_without_rare = set_a-values_from_rare_items #remove from a all the number you added with the rare unique numbers, they are already covered
solved = False
for L in range(0, len(b2)+1):
for subset in it.combinations(b2, L):
s = set(item for sublist in subset for item in sublist)
if a_without_rare.issubset(s):
length = L+num_rare_items
print(f'{length}', "*********f2")
solved = True
break
if solved: break
s = time.time()
f1(set_a,b)
print(time.time()-s,'********************f1')
s = time.time()
f2(set_a,b)
print(time.time()-s,'******************f2')
s = time.time()
f1(set_a,res_B)
print(time.time()-s,'********************f1')
s = time.time()
f2(set_a,res_B)
print(time.time()-s,'******************f2')
this is the out put
2 **************f1
0.16755199432373047 ********************f1 num_array
2 *********f2
0.09078240394592285 ******************f2 num_array
2 **************f1
0.0009989738464355469 ********************f1 your_data
2 *********f2
0.0009975433349609375 ******************f2 your_data
you can improve it more by taking all item that appear just few times, and treat them as if they appear once, in rare cases it will not be the real min number, but the time improvement is significant

Related

For all sets in a list, extract the first number only

I have a list that looks like this:
b = [{'dg_12.942_ch_293','dg_22.38_ca_627'},
{'dg_12.651_cd_286','dg_14.293_ce_334'},
{'dg_17.42_cr_432','dg_18.064_cm_461','dg_18.85_cn_474','dg_20.975_cf_489'}]
I want to keep only the first number for each item in each set:
b = [{'12','22'},
{'12','14'},
{'17','18','18','20'}]
I then want to find the difference between the smallest and the largest number of each set and put it in a list, so in this case I would have:
b = [3,2,3]
Ugly and without any sanity check, but do the work.
import re
SEARCH_NUMBER_REGEX = re.compile("(\d+)")
def foo(dataset):
out = []
for entries in dataset:
numbers = []
for entry in entries:
# Search for the first number in the str
n = SEARCH_NUMBER_REGEX.search(entry).group(1)
n = int(n)
numbers.append(n)
# Sort the numbers and sustract the last one (largest)
# by the first one (smallest)
numbers.sort()
out.append(numbers[-1] - numbers[0])
return out
b = [
{'dg_12.942_ch_293', 'dg_22.38_ca_627'},
{'dg_12.651_cd_286', 'dg_14.293_ce_334'},
{'dg_17.42_cr_432', 'dg_18.064_cm_461', 'dg_18.85_cn_474', 'dg_20.975_cf_489'}
]
print(b)
# > [10, 2, 3]
This is giving o/p as [10,2,3]
(The difference b/w 22 and 12 is 10)
b = [{'12','22'},
{'12','14'},
{'17','18','18','20'}]
l = []
for i in b:
large ,small = -99, 99
for j in i:
j = int(j)
if large < j:
large = j
if small >j:
small = j
l.append(large - small)
print(l)
Here's yet another way to do it:
import re
ba = [{'dg_12.942_ch_293', 'dg_22.38_ca_627'},
{'dg_12.651_cd_286', 'dg_14.293_ce_334'},
{'dg_17.42_cr_432', 'dg_18.064_cm_461', 'dg_18.85_cn_474', 'dg_20.975_cf_489'}]
bb = []
for s in ba:
ns = sorted([int(re.search(r'(\d+)', ss)[0]) for ss in s])
bb.append(ns[-1]-ns[0])
print(bb)
Output:
[10, 2, 3]
Or, if you want to be ridiculous:
ba = [{'dg_12.942_ch_293', 'dg_22.38_ca_627'},
{'dg_12.651_cd_286', 'dg_14.293_ce_334'},
{'dg_17.42_cr_432', 'dg_18.064_cm_461', 'dg_18.85_cn_474', 'dg_20.975_cf_489'}]
bb = [(n := sorted([int(re.search(r'(\d+)', ss)[0]) for ss in s]))[-1]-n[0] for s in ba]
print(bb)
In your final product I see it was "[3,2,3]" but if I am understanding your question correct, it would be [10,2,3]. Either way the code I have below will atleast point you in the right direction (hopefully).
This code will iterate through each tuple in the list and split the str (since that is all we want to compare) and add them into lists. These numbers are then evaluated and subtracts the smallest number from the biggest number, and places it in a separate array. This "separate array" is the final one as shown in your question.
Goodluck - hopefully this helps!
import re
b = [('dg_12.942_ch_293','dg_22.38_ca_627'), ('dg_12.651_cd_286','dg_14.293_ce_334'), ('dg_17.42_cr_432','dg_18.064_cm_461','dg_18.85_cn_474','dg_20.975_cf_489')]
final_array = []
for tup in b:
x = tup
temp_array = []
for num in x:
split_number = re.search(r'\d+', num).group()
temp_array.append(split_number)
difference = int(max(temp_array)) - int(min(temp_array))
final_array.append(difference)
print(final_array)

Python - removing repeated letters in a string

Say I have a string in alphabetical order, based on the amount of times that a letter repeats.
Example: "BBBAADDC".
There are 3 B's, so they go at the start, 2 A's and 2 D's, so the A's go in front of the D's because they are in alphabetical order, and 1 C. Another example would be CCCCAAABBDDAB.
Note that there can be 4 letters in the middle somewhere (i.e. CCCC), as there could be 2 pairs of 2 letters.
However, let's say I can only have n letters in a row. For example, if n = 3 in the second example, then I would have to omit one "C" from the first substring of 4 C's, because there can only be a maximum of 3 of the same letters in a row.
Another example would be the string "CCCDDDAABC"; if n = 2, I would have to remove one C and one D to get the string CCDDAABC
Example input/output:
n=2: Input: AAABBCCCCDE, Output: AABBCCDE
n=4: Input: EEEEEFFFFGGG, Output: EEEEFFFFGGG
n=1: Input: XXYYZZ, Output: XYZ
How can I do this with Python? Thanks in advance!
This is what I have right now, although I'm not sure if it's on the right track. Here, z is the length of the string.
for k in range(z+1):
if final_string[k] == final_string[k+1] == final_string[k+2] == final_string[k+3]:
final_string = final_string.translate({ord(final_string[k]): None})
return final_string
Ok, based on your comment, you're either pre-sorting the string or it doesn't need to be sorted by the function you're trying to create. You can do this more easily with itertools.groupby():
import itertools
def max_seq(text, n=1):
result = []
for k, g in itertools.groupby(text):
result.extend(list(g)[:n])
return ''.join(result)
max_seq('AAABBCCCCDE', 2)
# 'AABBCCDE'
max_seq('EEEEEFFFFGGG', 4)
# 'EEEEFFFFGGG'
max_seq('XXYYZZ')
# 'XYZ'
max_seq('CCCDDDAABC', 2)
# 'CCDDAABC'
In each group g, it's expanded and then sliced until n elements (the [:n] part) so you get each letter at most n times in a row. If the same letter appears elsewhere, it's treated as an independent sequence when counting n in a row.
Edit: Here's a shorter version, which may also perform better for very long strings. And while we're using itertools, this one additionally utilises itertools.chain.from_iterable() to create the flattened list of letters. And since each of these is a generator, it's only evaluated/expanded at the last line:
import itertools
def max_seq(text, n=1):
sequences = (list(g)[:n] for _, g in itertools.groupby(text))
letters = itertools.chain.from_iterable(sequences)
return ''.join(letters)
hello = "hello frrriend"
def replacing() -> str:
global hello
j = 0
for i in hello:
if j == 0:
pass
else:
if i == prev:
hello = hello.replace(i, "")
prev = i
prev = i
j += 1
return hello
replacing()
looks a bit primal but i think it works, thats what i came up with on the go anyways , hope it helps :D
Here's my solution:
def snip_string(string, n):
list_string = list(string)
list_string.sort()
chars = set(string)
for char in chars:
while list_string.count(char) > n:
list_string.remove(char)
return ''.join(list_string)
Calling the function with various values for n gives the following output:
>>> string = "AAAABBBCCCDDD"
>>> snip_string(string, 1)
'ABCD'
>>> snip_string(string, 2)
'AABBCCDD'
>>> snip_string(string, 3)
'AAABBBCCCDDD'
>>>
Edit
Here is the updated version of my solution, which only removes characters if the group of repeated characters exceeds n.
import itertools
def snip_string(string, n):
groups = [list(g) for k, g in itertools.groupby(string)]
string_list = []
for group in groups:
while len(group) > n:
del group[-1]
string_list.extend(group)
return ''.join(string_list)
Output:
>>> string = "DDDAABBBBCCABCDE"
>>> snip_string(string, 3)
'DDDAABBBCCABCDE'
from itertools import groupby
n = 2
def rem(string):
out = "".join(["".join(list(g)[:n]) for _, g in groupby(string)])
print(out)
So this is the entire code for your question.
s = "AABBCCDDEEE"
s2 = "AAAABBBDDDDDDD"
s3 = "CCCCAAABBDDABBB"
s4 = "AAAAAAAA"
z = "AAABBCCCCDE"
With following test:
AABBCCDDEE
AABBDD
CCAABBDDABB
AA
AABBCCDE

Count of sub-strings that contain character X at least once. E.g Input: str = “abcd”, X = ‘b’ Output: 6

This question was asked in an exam but my code (given below) passed just 2 cases out of 7 cases.
Input Format : single line input seperated by comma
Input: str = “abcd,b”
Output: 6
“ab”, “abc”, “abcd”, “b”, “bc” and “bcd” are the required sub-strings.
def slicing(s, k, n):
loop_value = n - k + 1
res = []
for i in range(loop_value):
res.append(s[i: i + k])
return res
x, y = input().split(',')
n = len(x)
res1 = []
for i in range(1, n + 1):
res1 += slicing(x, i, n)
count = 0
for ele in res1:
if y in ele:
count += 1
print(count)
When the target string (ts) is found in the string S, you can compute the number of substrings containing that instance by multiplying the number of characters before the target by the number of characters after the target (plus one on each side).
This will cover all substrings that contain this instance of the target string leaving only the "after" part to analyse further, which you can do recursively.
def countsubs(S,ts):
if ts not in S: return 0 # shorter or no match
before,after = S.split(ts,1) # split on target
result = (len(before)+1)*(len(after)+1) # count for this instance
return result + countsubs(ts[1:]+after,ts) # recurse with right side
print(countsubs("abcd","b")) # 6
This will work for single character and multi-character targets and will run much faster than checking all combinations of substrings one by one.
Here is a simple solution without recursion:
def my_function(s):
l, target = s.split(',')
result = []
for i in range(len(l)):
for j in range(i+1, len(l)+1):
ss = l[i] + l[i+1:j]
if target in ss:
result.append(ss)
return f'count = {len(result)}, substrings = {result}'
print(my_function("abcd,b"))
#count = 6, substrings = ['ab', 'abc', 'abcd', 'b', 'bc', 'bcd']
Here you go, this should help
from itertools import combinations
output = []
initial = input('Enter string and needed letter seperated by commas: ') #Asking for input
list1 = initial.split(',') #splitting the input into two parts i.e the actual text and the letter we want common in output
text = list1[0]
final = [''.join(l) for i in range(len(text)) for l in combinations(text, i+1)] #this is the core part of our code, from this statement we get all the available combinations of the set of letters (all the way from 1 letter combinations to nth letter)
for i in final:
if 'b' in i:
output.append(i) #only outputting the results which have the required letter/phrase in it

Python 2D array with same values

I am a beginner programmer and I am doing a task for school. The task is to assign 4 constant variables and then use a code to work out the value. Each value has a corresponding letter and the program is asking the user to type in 5 numbers then the program will return the word. The code is the following:
array = [["L","N"], #define the 2d array, L=Letters, N=Numbers
["-","-"]] #line for space
a = 2#define the variables
b = 1
c = 7
d = 4
e = (a*b)+b#calcualtions
f = c+b
g = (d/a)-b
h = c*a
i = a+b+d
j = c-a
k = c-d*f
l = c+a
m = (c*a)-b
n = a*d
o = a+d-b
p = (c*d)-a*(b+d)
q = a*(c+(d-b))
r = (d*d)-b
s = r-f-g
array.append(["e",e])
array.append(["f",f])
array.append(["g",g])#append all the calculations
array.append(["h",h])
array.append(["i",i])
array.append(["j",j])
array.append(["k",k])
array.append(["l",l])
array.append(["m",m])
array.append(["n",n])
array.append(["o",o])
array.append(["p",p])
array.append(["q",q])
array.append(["r",r])
array.append(["s",s])
def answer():
len_row = len(array)
number_input = int(input("Enter number: "))
for i in range(len_row):
if number_input == (array[i][1]):
return array[i][0]
break
one_let = answer()
two_let = answer()
thr_let = answer()
fou_let = answer()
fiv_let = answer()
print(one_let,two_let,thr_let,fou_let,fiv_let)
The numbers that I am meant to put in are 6, 18,, 7, 8, and 3.
The word that prints is "spife" and the word that is meant to be printed is "spine". The problem is that there are two letters that have a variable of 8 and Python gets the first one only. Is there a way to print out the two seperate words but first with the first variable in a 2D array and second with the second 2D array? i.e spife then spine
Thank you for your help ahead, I am just a beginner! :)
Yes you can do it but is a bit tricky the secret is to use itertools.product on the list of letters that could have each of the five values.
First you need to use a better data structure such as a dict, (in this case a collection.defaltdict) to hold the letters that have some value. You can do this way:
import collections
import itertools
a = 2#define the variables
b = 1
c = 7
d = 4
e = (a*b)+b#calcualtions
f = c+b
g = (d/a)-b
h = c*a
i = a+b+d
j = c-a
k = c-d*f
l = c+a
m = (c*a)-b
n = a*d
o = a+d-b
p = (c*d)-a*(b+d)
q = a*(c+(d-b))
r = (d*d)-b
s = r-f-g
dat = collections.defaultdict(list)
for c in "abcdefghijklmnopqrs":
dat[eval(c)].append(c)
Now in dat you have a list of letters that match some number, for example
print(dat[6])
print(dat[18])
print(dat[7])
print(dat[8])
print(dat[3])
Outputs:
['s']
['p']
['i']
['f', 'n']
['e']
OK, then you need to change answerto return a list of letters, and collect the user input:
def answer():
number_input = int(input("Enter number: "))
return dat[number_input]
letts = [answer() for _ in range(5)] #collect five answers of the user
And the final magic is done here:
for s in map(lambda x: "".join(x),itertools.product(*letts)):
print(s)
Now if you are confused then study:
collections
collections.defaultdict
itertools
itertools.product
str.join

Finding items that occur exactly once in an array

I have an 2 dimensional array. Each of the row vectors, in this case, is considered a quantity of interest. What I want to do is return all the rows that appear exactly once as one array, and all the rows that appear more than once as a second array.
For example, if the array was:
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
I would like to return two arrays:
nonsingles=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [5,1,6,0]]
singles= [[3,2,1,0], [4,4,1,0]]
It is important that the order stay preserved. The code I have written to do this is as follows:
def singles_nonsingles(array):
#returns the elements that occur only once, and the elements
#that occur more than once in the array
singles=[]
nonsingles=[]
arrayhash=map(tuple, array)
for x in arrayhash:
if (arrayhash.count(x)==1):
singles.append(x)
if (arrayhash.count(x)>1):
nonsingles.append(x)
nonsingles=array(nonsingles)
singles=array(singles)
return {'singles':singles, 'nonsingles':nonsingles}
Now, I am happy to say that this works, but unhappy to say that it is extremely slow, as a typical array i have is 30000(rows)x10 elements/row=300000 elements. Can anyone give me some tips about how to speed this up?? I apologize if this question is very simple, I am new to Python. Also, I am using Numpy/Scipy with Python 2.7, if that is any help.
In Python 2.7 or above, you can use collections.Counter to count the number of occurrences:
def unique_items(iterable):
tuples = map(tuple, iterable)
counts = collections.Counter(tuples)
unique = []
non_unique = []
for t in tuples:
if counts[t] == 1:
unique.append(t)
else:
non_unique.append(t)
return unique, non_unique
I think your problem is that you are doing an in test on a list. This has O(n) performance.
It should be faster to build a dict and then use that to figure out what to do with each row.
EDIT: The code had an unnecessary enumerate() in it; I stripped it out.
from collections import defaultdict
def singles_nonsingles(array):
#returns the elements that occur only once, and the elements
#that occur more than once in the array
singles=[]
nonsingles=[]
d = defaultdict(int)
t = [tuple(row) for row in array]
for row in t:
d[row] += 1
for row in t:
if d[row] == 1:
singles.append(row)
else:
nonsingles.append(row)
return {'singles':singles, 'nonsingles':nonsingles}
Here's a version that only returns unique rows:
from collections import defaultdict
def singles_nonsingles(array):
#returns the elements that occur only once, and the elements
#that occur more than once in the array
singles=[]
nonsingles=[]
d = defaultdict(int)
already_seen = set()
t = [tuple(row) for row in array]
for row in t:
d[row] += 1
for row in t:
if row in already_seen:
continue
if d[row] == 1:
singles.append(row)
else:
nonsingles.append(row)
already_seen.add(row)
return {'singles':singles, 'nonsingles':nonsingles}
a=[[1,1,1,0], [1,1,1,0], [5,1,6,0], [3,2,1,0], [4,4,1,0], [5,1,6,0]]
x = singles_nonsingles(a)
print("Array: " + str(a))
print(x)
The first return only the list of the single/no single arrays without repetitions, the second with repetitions
def comp (multi):
from collections import defaultdict
res = defaultdict(int)
for vect in multi:
res[tuple(vect)] += 1
singles = []
no_singles = []
for k in res:
if res[k] > 1:
no_singles.append(list(k))
elif res[k] == 1:
singles.append(list(k))
return singles, no_singles
def count_w_repetitions(multi):
from collections import defaultdict
res = defaultdict(int)
for vect in multi:
res[tuple(vect)] += 1
singles = []
no_singles = []
for k in res:
if res[k] == 1:
singles.append(list(k))
else:
for i in xrange(res[k]):
no_singles.append(list(k))
return singles, no_singles
from itertools import compress,imap
def has_all_unique(a):
return len(a) == len(frozenset(a))
uniq = map( has_all_unique,a)
singles = list(compress(a,uniq))
notuniq = imap(lambda x: not x,uniq)
nonsingles = list(compress(a,notuniq))

Categories