improve this very-simple dictionary generator in python - python

I'm trying to make a simple dict generator. It works but it isn't very functional yet.
I'd like to improve it by being able to change the max size of the output without touching the code.
letr='abcdefghijklmnopqrstuvwxyz'
for i in range(len(letr)):
t=letr[i]
print t
for t2 in letr:
print t+t2
for t3 in letr:
print t+t2+t3
for t4 in letr:
print t+t2+t3+t4
for t5 in letr:
print t+t2+t3+t4+t5

import itertools
def dict_gen(n):
    """Generate every lowercase ASCII word of length 1 through ``n``.

    Words come out shortest first and, within one length, in alphabetical
    order ('a' ... 'z', 'aa', 'ab', ...). Words with repeated letters such
    as 'aa' are included. Nothing is produced when ``n`` is less than 1.

    Args:
        n: maximum word length.

    Yields:
        str: the next word.
    """
    letr = 'abcdefghijklmnopqrstuvwxyz'
    # A plain generator function is enough here: there is only one stream of
    # words, so there is nothing for itertools.chain() to chain together.
    for length in range(1, n + 1):
        for letters in itertools.product(letr, repeat=length):
            yield ''.join(letters)
Usage:
for word in dict_gen(n): # replace n with the max word length you want
print word
Unlike some of the other answers this will include duplicates like your example ('aa', 'bb', etc).
dict_gen() will return a generator, but you can always just pass it into list() if you need to access elements by index:
>>> words = list(dict_gen(5))
>>> len(words) == 26 + 26**2 + 26**3 + 26**4 + 26**5 # verify correct length
True
>>> words[20:30] # transition from one letter to two letters
['u', 'v', 'w', 'x', 'y', 'z', 'aa', 'ab', 'ac', 'ad']
>>> words[-10:] # last 10 elements
['zzzzq', 'zzzzr', 'zzzzs', 'zzzzt', 'zzzzu', 'zzzzv', 'zzzzw', 'zzzzx', 'zzzzy', 'zzzzz']

letr = ''.join(chr(o) for o in range(ord('a'), ord('z') + 1))
import itertools
print [''.join(word) for word in itertools.permutations(letr, 5)]

Itertools is your best friend.
>>> import itertools
>>> gen = ("".join(i) for i in itertools.permutations(letr, 5))
>>> list(gen)[-10:]
['zyxwm', 'zyxwn', 'zyxwo', 'zyxwp', 'zyxwq', 'zyxwr', 'zyxws', 'zyxwt', 'zyxwu', 'zyxwv']
If you want to get all the permutations, you could write a generator yourself:
import itertools
def perms(seq):
    """Yield the permutations of ``seq`` for every possible length.

    Lengths run from 0 (a single empty tuple) up to ``len(seq)``. Each
    permutation is a tuple, emitted in the order that
    ``itertools.permutations`` produces them for that length.
    """
    for size in range(len(seq) + 1):
        yield from itertools.permutations(seq, size)
Check the Python documentation for itertools and generators for more info.

Related

generating list of every combination without duplicates

I would like to generate a list of combinations. I will try to simplify my problem to make it understandable.
We have 3 variables :
x : number of letters
k : number of groups
n : number of letters per group
I would like to generate, using Python, a list of every possible combination without any duplicates, knowing that I don't care about the order of the groups or the order of the letters within a group.
As an example, with x = 4, k = 2, n = 2 :
# we start with 4 letters, we want to make 2 groups of 2 letters
letters = ['A','B','C','D']
# here would be a code that generate the list
# Here is the result that is very simple, only 3 combinations exist.
combos = [ ['AB', 'CD'], ['AC', 'BD'], ['AD', 'BC'] ]
Since I don't care about the order of or within the groups, and letters within a group, ['AB', 'CD'] and ['DC', 'BA'] is a duplicate.
This is a simplification of my real problem, which has those values : x = 12, k = 4, n = 3. I tried to use some functions from itertools, but with that many letters my computer freezes because it's too many combinations.
Another way of seeing the problem : you have 12 players, you want to make 4 teams of 3 players. What are all the possibilities ?
Could anyone help me to find an optimized solution to generate this list?
There will certainly be more sophisticated/efficient ways of doing this, but here's an approach that works in a reasonable amount of time for your example and should be easy enough to adapt for other cases.
It generates unique teams and unique combinations thereof, as per your specifications.
from itertools import combinations
# this assumes that team_size * team_num == len(players) is a given
team_size = 3
team_num = 4
players = list('ABCDEFGHIJKL')
unique_teams = [set(c) for c in combinations(players, team_size)]
def duplicate_player(combo):
    """Return True when the teams in ``combo`` do not cover every player.

    The teams (sets) are merged and the size of the merged set is compared
    with ``len(players)``. Under the script's stated assumption that
    team_size * team_num == len(players), a smaller union means at least
    one player sits on more than one team.
    """
    covered = set.union(*combo)
    return len(players) > len(covered)
result = (combo for combo in combinations(unique_teams, team_num) if not duplicate_player(combo))
result is a generator that can be iterated or turned into a list with list(result). On kaggle.com, it takes a minute or so to generate the whole list of all possible combinations (a total of 15400, in line with the computations by @beaker and @John Coleman in the comments). The teams are tuples of sets that look like this:
[({'A', 'B', 'C'}, {'D', 'E', 'F'}, {'G', 'H', 'I'}, {'J', 'K', 'L'}),
({'A', 'B', 'C'}, {'D', 'E', 'F'}, {'G', 'H', 'J'}, {'I', 'K', 'L'}),
({'A', 'B', 'C'}, {'D', 'E', 'F'}, {'G', 'H', 'K'}, {'I', 'J', 'L'}),
...
]
If you want, you can cast them into strings by calling ''.join() on each of them.
Another solution (players are numbered 0, 1, ...):
import itertools
def equipartitions(base_count: int, group_size: int):
if base_count % group_size != 0:
raise ValueError("group_count must divide base_count")
return set(_equipartitions(frozenset(range(base_count)), group_size))
def _equipartitions(base_set: frozenset, group_size: int):
if not base_set:
yield frozenset()
for combo in itertools.combinations(base_set, group_size):
for rest in _equipartitions(base_set.difference(frozenset(combo)), group_size):
yield frozenset({frozenset(combo), *rest})
all_combinations = [
[tuple(team) for team in combo]
for combo in equipartitions(12, 3)
]
print(all_combinations)
print(len(all_combinations))
And another:
import itertools
from typing import Iterable
def equipartitions(players: Iterable, team_size: int):
    """Yield every way to split ``players`` into teams of ``team_size``.

    Repeated entries in ``players`` are collapsed first, so any iterable
    (including a one-shot iterator) is accepted. Each partition is a list of
    sets; neither the order of the teams nor the order inside a team is
    significant, and every partition is produced exactly once.

    Args:
        players: hashable player identifiers.
        team_size: number of players per team.

    Returns:
        A generator of partitions (each a list of sets).

    Raises:
        ValueError: if ``team_size`` is less than 1 or does not divide the
            number of distinct players. Raised at call time, not lazily.
    """
    if team_size < 1:
        raise ValueError("team_size must be a positive integer")
    # De-duplicate before the divisibility check: the helper works on a set,
    # so the check has to look at the same population it will partition.
    roster = set(players)
    if len(roster) % team_size != 0:
        raise ValueError("team_size must divide the number of distinct players")
    return _equipartitions(roster, team_size)


def _equipartitions(players: set, team_size: int):
    """Yield each partition of ``players`` into teams of ``team_size`` once."""
    if not players:
        yield []
        return
    # Fixing one player on the first team prevents the same partition from
    # being produced again with its teams in a different order.
    first_player, *other_players = players
    for other_team_members in itertools.combinations(other_players, team_size - 1):
        first_team = {first_player, *other_team_members}
        for other_teams in _equipartitions(set(other_players) - first_team, team_size):
            yield [first_team, *other_teams]
all_combinations = [
{''.join(sorted(team)) for team in combo} for combo in equipartitions(players='ABCDEFGHIJKL', team_size=3)
]
print(all_combinations)
print(len(all_combinations))
Firstly, you can use a list comprehension to give you all of the possible combinations (regardless of the duplicates):
comb = [(a,b) for a in letters for b in letters if a != b]
And, afterwards, you can use the sorted function to sort the tuples. After that, to remove the duplicates, you can convert all of the items to a set and then back to a list.
var = [tuple(sorted(sub)) for sub in comb]
var = list(set(var))
You could use the list comprehension approach, which performs n(n-1) iterations, or you could use a more verbose way that needs only half as many, (n^2-n)/2:
comb = []
for first_letter_idx, _ in enumerate(letters):
for sec_letter_idx in range(first_letter_idx + 1, len(letters)):
comb.append(letters[first_letter_idx] + letters[sec_letter_idx])
print(comb)
comb2 = []
for first_letter_idx, _ in enumerate(comb):
for sec_letter_idx in range(first_letter_idx + 1, len(comb)):
if (comb[first_letter_idx][0] not in comb[sec_letter_idx]
and comb[first_letter_idx][1] not in comb[sec_letter_idx]):
comb2.append([comb[first_letter_idx], comb[sec_letter_idx]])
print(comb2)
This algorithm needs more work to handle dynamic inputs. Maybe with recursion.
Use combination from itertools
from itertools import combinations
x = list(combinations(['A','B','C','D'],2))
t = []
for i in (x):
t.append(i[0]+i[1]) # concatenating the strings and adding in a list
g = []
for i in range(0,len(t),2):
for j in range(i+1,len(t)):
g.append([t[i],t[j]])
break
print(g)

Counting triplets in a DNA-sequence

I want to make a code which counts all triplets in a sequence. I've read a plenty of posts so far, but none of them could help me.
This is my code:
def cnt(seq):
mydict = {}
if len(seq) % 3 == 0:
a = [x for x in seq]
for i in range(len(seq)//3):
b = ''.join(a[(0+3*i):(3+3*i)])
for base1 in ['A', 'T', 'G', 'C']:
for base2 in ['A', 'T', 'G', 'C']:
for base3 in ['A', 'T', 'G', 'C']:
triplet = base1 + base2 + base3
if b == triplet:
mydict[b] = 1
for key in sorted(mydict):
print("%s: %s" % (key, mydict[key]))
else:
print("Error")
Does Biopython provide a function to solve this problem?
EDIT:
Note that, for instance, in the sequence 'ATGAAG', 'TGA' or 'GAA' are not "valid" triplets, only 'ATG' and 'AAG', because in biology and bioinformatics we read it as 'ATG' and 'AAG'; that's the information we need to translate it or do anything else with it.
You can imagine it as a sequence of words, for example "Hello world". The way we read it is "Hello" and "world", not "Hello", "ello ", "llo w",...
It took me a while to understand that you do not want to count the number of codons but the frequency of each codon. Your title is a bit misleading in this respect. Anyway, you can employ collections.Counter for your task:
from collections import Counter
def cnt(seq):
    """Print the count and percentage share of every codon in ``seq``.

    ``seq`` is cut into consecutive, non-overlapping triplets. One line per
    distinct triplet is printed in sorted order. If the length of ``seq`` is
    not a multiple of three, only "Error" is printed.
    """
    if len(seq) % 3:
        print("Error")
        return
    # tally the triplets starting at offsets 0, 3, 6, ...
    codon_freq = Counter(seq[start:start + 3] for start in range(0, len(seq), 3))
    # total number of triplets read, used as the percentage base
    total = sum(codon_freq.values())
    for codon, hits in sorted(codon_freq.items()):
        print("{}: {} = {:5.2f}%".format(codon, hits, hits * 100 / total))
seq = "ATCGCAGAAATCCGCAGAATC"
cnt(seq)
Sample output:
AGA: 1 = 14.29%
ATC: 3 = 42.86%
CGC: 1 = 14.29%
GAA: 1 = 14.29%
GCA: 1 = 14.29%
You can use clever techniques, as suggested in the other answers, but I will build a solution starting from your code, which is almost working: Your problem is that every time you do mydict[b] = 1, you reset the count of b to 1.
A minimal fix
You could solve this by testing if the key is present, if not, create the entry in the dict, then increment the value, but there are more convenient tools in python.
A minimal change to your code would be to use a defaultdict(int) instead of a dict. Whenever a new key is encountered, it is assumed to have the associated default value for an int: 0. So you can increment the value instead of resetting:
from collections import defaultdict
def cnt(seq):
# instanciate a defaultdict that creates ints when necessary
mydict = defaultdict(int)
if len(seq) % 3 == 0:
a = [x for x in seq]
for i in range(len(seq)//3):
b = ''.join(a[(0+3*i):(3+3*i)])
for base1 in ['A', 'T', 'G', 'C']:
for base2 in ['A', 'T', 'G', 'C']:
for base3 in ['A', 'T', 'G', 'C']:
triplet = base1 + base2 + base3
if b == triplet:
# increment the existing count (or the default 0 value)
mydict[b] += 1
for key in sorted(mydict):
print("%s: %s" % (key, mydict[key]))
else:
print("Error")
It works as desired:
cnt("ACTGGCACT")
ACT: 2
GGC: 1
Some possible improvements
Now let's try to improve your code a bit.
First, as I wrote in the comments, let's avoid the un-necessary conversion of your sequence to a list, and use a better variable name for the currently counted codon:
from collections import defaultdict
def cnt(seq):
    """Print how many times each valid codon occurs in ``seq``.

    ``seq`` is read as consecutive, non-overlapping triplets. A triplet is
    counted only if it consists of the letters A, T, G and C. Counts are
    printed one per line in sorted codon order. If the length of ``seq`` is
    not a multiple of three, only "Error" is printed.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        for i in range(len(seq)//3):
            # slice the string directly; no intermediate list is needed
            codon = seq[(0+3*i):(3+3*i)]
            for base1 in ['A', 'T', 'G', 'C']:
                for base2 in ['A', 'T', 'G', 'C']:
                    for base3 in ['A', 'T', 'G', 'C']:
                        triplet = base1 + base2 + base3
                        if codon == triplet:
                            mydict[codon] += 1
        for key in sorted(mydict):
            print("%s: %s" % (key, mydict[key]))
    else:
        print("Error")
Now let's simplify the nested loop part, which tries all possible codons, by generating the set of possible codons in advance:
from collections import defaultdict
from itertools import product
# Every triplet that can be formed from the four bases (64 strings in all).
codons = {
    "".join((base1, base2, base3))
    for (base1, base2, base3) in product("ACGT", "ACGT", "ACGT")}


def cnt(seq):
    """Print how many times each valid codon occurs in ``seq``.

    ``seq`` is read as consecutive, non-overlapping triplets. Triplets that
    are not members of ``codons`` are silently skipped. Counts are printed
    one per line in sorted codon order. If the length of ``seq`` is not a
    multiple of three, only "Error" is printed.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            # a set membership test replaces the three nested loops
            if codon in codons:
                mydict[codon] += 1
        for key in sorted(mydict):
            print("%s: %s" % (key, mydict[key]))
    else:
        print("Error")
Now, your code simply ignores the triplets that are not valid codons. Maybe you should instead issue a warning:
from collections import defaultdict
from itertools import product
# Every triplet that can be formed from the four bases (64 strings in all).
codons = {
    "".join((base1, base2, base3))
    for (base1, base2, base3) in product("ACGT", "ACGT", "ACGT")}


def cnt(seq):
    """Print the count of every valid codon and warn about invalid triplets.

    ``seq`` is read as consecutive, non-overlapping triplets. One line is
    printed for each member of ``codons`` in sorted order, including those
    that never occur (count 0). If any triplet read from ``seq`` is not in
    ``codons``, a warning listing those triplets follows. If the length of
    ``seq`` is not a multiple of three, only "Error" is printed.
    """
    mydict = defaultdict(int)
    if len(seq) % 3 == 0:
        for i in range(len(seq)//3):
            codon = seq[(0+3*i):(3+3*i)]
            # We count even invalid triplets
            mydict[codon] += 1
        # We display counts only for valid triplets.
        # Reading a codon that was never seen yields the default 0.
        for codon in sorted(codons):
            print("%s: %s" % (codon, mydict[codon]))
        # We compute the set of invalid triplets:
        # the keys that are not codons.
        invalid = mydict.keys() - codons
        # An empty set has value False in a test.
        # We issue a warning if the set is not empty.
        if invalid:
            print("Warning! There are invalid triplets:")
            print(", ".join(sorted(invalid)))
    else:
        print("Error")
A more fancy solution
Now a more fancy solution, using cytoolz (probably needs to be installed because it is not part of usual python distributions: pip3 install cytoolz, if you are using pip):
from collections import Counter
from itertools import product, repeat
from cytoolz import groupby, keymap, partition
# To make strings out of lists of strings
CAT = "".join
# The star "extracts" the elements from the result of repeat,
# so that product has 3 arguments, and not a single one
codons = {CAT(bases) for bases in product(*repeat("ACGT", 3))}
def cnt(seq):
# keymap(CAT, ...) transforms the keys (that are tuples of letters)
# into strings
# if len(seq) is not a multiple of 3, pad="-" will append "-"
# to complete the last triplet (which will be an invalid one)
codon_counts = keymap(CAT, Counter(partition(3, seq, pad="-")))
# separate encountered codons into valids and invalids
codons_by_validity = groupby(codons.__contains__, codon_counts.keys())
# get allows to provide a default value,
# in case one of the categories is not present
valids = codons_by_validity.get(True, [])
invalids = codons_by_validity.get(False, [])
# We display counts only for valid triplets
for codon in sorted(valids):
print("%s: %s" % (codon, codon_counts[codon]))
# We issue a warning if there are invalid codons.
if invalids:
print("Warning! There are invalid triplets:")
print(", ".join(sorted(invalids)))
Hope this helps.
You could do something like this:
from itertools import product
seq = 'ATGATG'
all_triplets = [seq[i:i+3] for i in range(len(seq)) if i <= len(seq)-3]
# this gives ['ATG', 'TGA', 'GAT', 'ATG']
# add more valid_triplets here
valid_triplets = ['ATG']
len([(i, j) for i, j in product(valid_triplets, all_triplets) if i==j])
Output:
2
It is unclear what output is expected. Here we use one of many grouping functions from more_itertools to build adjacent triplets or "codons".
import more_itertools as mit
seq = "ATGATG"
# grouper takes the iterable first and the group size second
codons = ["".join(w) for w in mit.grouper(seq, 3)]
codons
# ['ATG', 'ATG']
Count the number of codons by calling len.
len(codons)
# 2
For more detailed analysis, consider splitting the problem into smaller functions that (1) extract codons and (2) compute occurrences.
Code
import collections as ct
def split_codons(seq):
"Return codons from a sequence; raise for bad sequences."
for w in mit.windowed(seq, n=3, step=3, fillvalue=""):
part = "".join(w)
if len(part) < 3:
raise ValueError(f"Sequence not divisible by 3. Got extra '{part}'.")
yield part
def count_codons(codons):
    """Map each codon to a ``(count, percentage)`` tuple.

    ``codons`` may be any iterable of hashable items and is consumed once.
    Keys appear in first-seen order; the percentage is the codon's share of
    all items read. An empty input gives an empty dictionary.
    """
    # iter() makes Counter tally the items one by one, even for a mapping.
    tally = ct.Counter(iter(codons))
    total = sum(tally.values())
    return {codon: (hits, 100 * hits/total) for codon, hits in tally.items()}
Demo
>>> seq = "ATCGCAGAAATCCGCAGAATC"
>>> bad_seq = "ATCGCAGAAATCCGCAGAATCA"
>>> list(split_codons(seq))
['ATC', 'GCA', 'GAA', 'ATC', 'CGC', 'AGA', 'ATC']
>>> list(split_codons(bad_seq))
ValueError: Sequence not divisible by 3. Got extra 'A'.
>>> count_codons(split_codons(seq))
{'ATC': (3, 42.857142857142854),
'GCA': (1, 14.285714285714286),
'GAA': (1, 14.285714285714286),
'CGC': (1, 14.285714285714286),
'AGA': (1, 14.285714285714286)}

Fastest object to iterate a chars in a list of strings

I'm iterating through a list of words to find the most frequently used character between words (i.e. in list [hello, hank], 'h' counts as appearing twice while 'l' counts as appearing once.). A python list works fine, but I'm also looking into NumPy (dtype array?), and Pandas. It looks like Numpy may be the way to go, but are there other packages to consider? How else can I make this function faster?
Code in Question:
def mostCommon(guessed, li):
count = Counter()
for words in li:
for letters in set(words):
count[letters]+=1
return count.most_common()[:10]
Thanks.
Here's a NumPy approach using its views-concept -
def tabulate_occurrences(a): # Case sensitive
chars = np.asarray(a).view('S1')
valid_chars = chars[chars!='']
unqchars, count = np.unique(valid_chars, return_counts=1)
return pd.DataFrame({'char':unqchars, 'count':count})
def topNchars(a, N = 10): # Case insensitive
s = np.core.defchararray.lower(a).view('uint8')
unq, count = np.unique(s[s!=0], return_counts=1)
sidx = count.argsort()[-N:][::-1]
h = unq[sidx]
return [str(unichr(i)) for i in h]
Sample run -
In [322]: a = ['er', 'IS' , 'you', 'Is', 'is', 'er', 'IS']
In [323]: tabulate_occurrences(a) # Case sensitive
Out[323]:
char count
0 I 3
1 S 2
2 e 2
3 i 1
4 o 1
5 r 2
6 s 2
7 u 1
8 y 1
In [533]: topNchars(a, 5) # Case insensitive
Out[533]: ['s', 'i', 'r', 'e', 'y']
In [534]: topNchars(a, 10) # Case insensitive
Out[534]: ['s', 'i', 'r', 'e', 'y', 'u', 'o']
option 1
def pir1(li):
sets = [set(s) for s in li]
ul = np.array(list(set.union(*sets)))
us = np.apply_along_axis(set, 1, ul[:, None])
c = (sets >= us).sum(1)
a = c.argsort()[:-11:-1]
return ul[a]
option 2
def pir2(li):
    """Return the ten characters that occur in the most words of ``li``.

    Each character is counted at most once per word. The result is a list
    of ``(character, word_count)`` pairs, most frequent first.
    """
    letters_per_word = (set(word) for word in li)
    return Counter(chain.from_iterable(letters_per_word)).most_common(10)
Assume a list of words li
import pandas as pd
import numpy as np
from string import ascii_lowercase
li = pd.DataFrame(
np.random.choice(list(ascii_lowercase), (1000, 10))
).sum(1).tolist()
Including Divakar's and OP's functions
def tabulate_occurrences(a):
chars = np.asarray(a).view('S1')
valid_chars = chars[chars!='']
unqchars, count = np.unique(valid_chars, return_counts=1)
return pd.DataFrame({'char':unqchars, 'count':count})
def topNchars(a, N = 10):
s = np.core.defchararray.lower(a).view('uint8')
unq, count = np.unique(s[s!=0], return_counts=1)
sidx = count.argsort()[-N:][::-1]
h = unq[sidx]
return [str(chr(i)) for i in h]
def mostCommon(li):
    """Return the ten characters that occur in the most words of ``li``.

    Each character is counted at most once per word. The result is a list
    of ``(character, word_count)`` pairs, most frequent first.
    """
    count = Counter(letter for word in li for letter in set(word))
    ranked = count.most_common()
    return ranked[:10]
testing
import pandas as pd
import numpy as np
from string import ascii_lowercase
from timeit import timeit
# One row per word-list size, one column per function being timed.
results = pd.DataFrame(
    index=pd.RangeIndex(5, 405, 5, name='No. Words'),
    columns=pd.Index('pir1 pir2 mostCommon topNchars'.split(), name='Method'),
)
np.random.seed([3,1415])
for i in results.index:
    # i random 10-letter lowercase words
    li = pd.DataFrame(
        np.random.choice(list(ascii_lowercase), (i, 10))
    ).sum(1).tolist()
    for j in results.columns:
        v = timeit(
            '{}(li)'.format(j),
            'from __main__ import {}, li'.format(j),
            number=100
        )
        # DataFrame.set_value was removed in pandas 1.0; .at sets one cell.
        results.at[i, j] = v
ax = results.plot(title='Time Testing')
ax.set_ylabel('Time of 100 iterations')
Assuming you only want the most frequent character, where each character counts only once per word:
>>> from itertools import chain
>>> l = ['hello', 'hank']
>>> chars = list(chain.from_iterable([list(set(word)) for word in l]))
>>> max(chars, key=chars.count)
'h'
Using max with list.count can be a lot faster than using Counter due to the C level implementation.
This looks like it is very fast already, and runs in O(n). The only real improvement opportunity that I see would be to parallelize this process by splitting li into multiple parts.
Here is a pure Python solution that uniqueifies each string, joins the sets, then counts the results (Using Divakar's example list)
>>> li=['er', 'IS' , 'you', 'Is', 'is', 'er', 'IS']
>>> Counter(e for sl in map(list, map(set, li)) for e in sl)
Counter({'I': 3, 'e': 2, 's': 2, 'S': 2, 'r': 2, 'o': 1, 'i': 1, 'u': 1, 'y': 1})
If you want upper and lower case to be counted as the same letter:
>>> Counter(e for sl in map(list, map(set, [s.lower() for s in li])) for e in sl)
Counter({'i': 4, 's': 4, 'e': 2, 'r': 2, 'o': 1, 'u': 1, 'y': 1})
Now let's time that:
from __future__ import print_function
from collections import Counter
import numpy as np
import pandas as pd
def dawg(li):
return Counter(e for sl in map(list, map(set, li)) for e in sl)
def nump(a):
chars = np.asarray(a).view('S1')
valid_chars = chars[chars!='']
unqchars, count = np.unique(valid_chars, return_counts=1)
return pd.DataFrame({'char':unqchars, 'count':count})
if __name__=='__main__':
import timeit
li=['er', 'IS' , 'you', 'Is', 'is', 'er', 'IS']
for f in (dawg, nump):
print(" ",f.__name__, timeit.timeit("f(li)", setup="from __main__ import f, li", number=100) )
Results:
dawg 0.00134205818176
nump 0.0347728729248
The Python solution is significantly faster in this case
Just do
# set(word) makes each character count at most once per word, as the
# question requires ('l' in 'hello' must count once, not twice).
counter = Counter(char for word in li for char in set(word))
most_common = counter.most_common()
and you're done

Python - Return top 5 words with highest frequency

As the title says, I need to write a code that returns a list of 5 words (from an input string) that have the highest frequency. This is what I have so far:
from collections import defaultdict
def top5_words(text):
tally = defaultdict(int)
words = text.split()
for word in words:
if word in tally:
tally[word] += 1
else:
tally[word] = 1
answer = sorted(tally, key=tally.get, reverse = True)
return(answer)
For example if you input: top5_words("one one was a racehorse two two was one too") it should return: ["one", "two", "was", "a", "racehorse"] but instead it returns: ['one', 'was', 'two', 'racehorse', 'too', 'a'] - does anyone know why this is?
EDIT:
This is what I've got now thanks to Anand S Kumar:
import collections
def top5_words(text):
counts = collections.Counter(text.split())
return [elem for elem, _ in sorted(counts.most_common(),key=lambda x:(-x[1], x[0]))[:5]]
You should use collections.Counter and then you can use its method - most_common() . Example -
import collections
def top5_words(text):
counts = collections.Counter(text.split())
return counts.most_common(5)
Please note, above returns a list of 5 tuples, and in each tuple, first element is the actual word and the second element the count of that word.
Demo -
>>> import collections
>>> def top5_words(text):
... counts = collections.Counter(text.split())
... return counts.most_common(5)
...
>>> top5_words("""As the title says, I need to write a code that returns a list of 5 words (from an input string) that have the highest frequency. This is what I have so far""")
[('that', 2), ('a', 2), ('I', 2), ('the', 2), ('have', 2)]
If you just want the elements and not the count , then you can also use list comprehension to take that information. Example -
import collections
def top5_words(text):
counts = collections.Counter(text.split())
return [elem for elem, _ in counts.most_common(5)]
Demo -
>>> import collections
>>> def top5_words(text):
... counts = collections.Counter(text.split())
... return [elem for elem, _ in counts.most_common(5)]
...
>>> top5_words("""As the title says, I need to write a code that returns a list of 5 words (from an input string) that have the highest frequency. This is what I have so far""")
['that', 'a', 'I', 'the', 'have']
For the new requirement from comments -
it seems there's still an issue when it comes to words with the same frequency, how would I get it to sort same frequency words alphabetically?
You can first get the list of all words and their counts and then use sorted such that sorted first sorts on the count and then on the element itself (so it gets sorted lexicographically, when the count is same). Example -
import collections
def top5_words(text):
    """Return the five most frequent words in ``text``, case-insensitively.

    Words are the whitespace-separated tokens of the lower-cased text.
    Ties in frequency are broken alphabetically. Fewer than five words are
    returned when the text has fewer than five distinct words.
    """
    counts = collections.Counter(text.lower().split())
    # One sort on (descending count, word) gives the full ordering, so there
    # is no need to pre-sort with most_common() first.
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    return [word for word, _ in ranked[:5]]
Demo -
>>> import collections
>>> def top5_words(text):
... counts = collections.Counter(text.lower().split())
... return [elem for elem, _ in sorted(counts.most_common(),key=lambda x:(-x[1], x[0]))[:5]]
...
>>> top5_words("""As the title says, I need to write a code that returns a list of 5 words (from an input string) that have the highest frequency. This is what I have so far""")
['a', 'have', 'i', 'that', 'the']

Combine elements of a list with all possible separators

I have the following requirement.
I have a list which say has 3 elements [X,Y,2]
What I would like to do is to generate strings with a separator (say "-") between (or not) each element. The order of the elements in the array should be preserved.
So the output would be:
'XY2'
'X-Y-2'
'X-Y2'
'XY-2'
Is there an elegant way to do this in Python?
>>> import itertools
>>> for c in itertools.product(' -', repeat=2): print ('X%sY%s2' % c).replace(' ', '')
XY2
XY-2
X-Y2
X-Y-2
Or, with the elements coming from a python list:
import itertools
a = ['X', 'Y', 2]
for c in itertools.product(' -', repeat=2):
print ('%s%s%s%s%s' % (a[0],c[0],a[1],c[1],a[2])).replace(' ', '')
Or, in a slightly different style:
import itertools
a = ['X', 'Y', '2']
for c in itertools.product(' -', repeat=2):
print ( '%s'.join(a) % c ).replace(' ', '')
To capture the output to a list:
import itertools
a = ['X', 'Y', '2']
output = []
for c in itertools.product(' -', repeat=len(a)-1):
output.append( ('%s'.join(a) % c).replace(' ', '') )
print 'output=', output
A little more generalized but works for any number of separators and hopefully is easy to understand at each step:
import itertools
a = ['X', 'Y', '2']
all_separators = ['', '-', '+']
results = []
# this product puts all separators in all positions for len-1 (spaces between each element)
for this_separators in itertools.product(all_separators, repeat=len(a)-1):
this_result = []
for pair in itertools.izip_longest(a, this_separators, fillvalue=''):
for element in pair:
this_result.append(element)
# if you want it, here it is as a comprehension
# this_result = [element for pair
# in itertools.izip_longest(a, this_separators, fillvalue='')
# for element in pair]
this_result_string = ''.join(this_result) # check out join docs if it's new to you
results.append(this_result_string)
print results
>>> ['XY2', 'XY-2', 'XY+2', 'X-Y2', 'X-Y-2', 'X-Y+2', 'X+Y2', 'X+Y-2', 'X+Y+2']
These are the results for your case with just '' and '-' as separators:
>>> ['XY2', 'XY-2', 'X-Y2', 'X-Y-2']
If you want everything in one comprehension:
results = [''.join(element for pair
in itertools.izip_longest(a, this_separators, fillvalue='')
for element in pair)
for this_separators in itertools.product(all_separators, repeat=len(a)-1)]
I don't know whether itertools has a function for this, but I always think it's fun and a good exercise to do this kind of thing. So here is a solution with a recursive generator:
def generate(liste):
    """Yield every way to cut ``liste`` into consecutive, non-empty groups.

    Each result is a list of groups, and each group is a list of adjacent
    elements in their original order. ``liste`` must contain at least one
    element.
    """
    if len(liste) == 1:
        yield [liste]
        return
    head = liste[0]
    for tail_split in generate(liste[1:]):
        # the first element forms a group of its own ...
        yield [[head]] + tail_split
        # ... or is merged into the first group of the remainder
        first_group, later_groups = tail_split[0], tail_split[1:]
        yield [[head] + first_group] + later_groups
if __name__ == "__main__":
for i in generate (["X","Y","2"]):
print "test : " + str(i)
if len(i) == 1:
print "".join(i[0])
else:
print reduce(
lambda left, right : left + "".join(right),
i,
"")
Something like this?
from itertools import permutations
i = ["X","Y","2"]
for result in permutations(i, 3):
print "-".join(result)
Result:
X-Y-2
X-2-Y
Y-X-2
Y-2-X
2-X-Y
2-Y-X

Categories