Python: merging tally data - python

Okay - I'm sure this has been answered here before but I can't find it....
My problem: I have a list of lists with this composition
0.2 A
0.1 A
0.3 A
0.3 B
0.2 C
0.5 C
My goal is to output the following:
0.6 A
0.3 B
0.7 C
In other words, I need to merge the data from multiple lines together.
Here's the code I'm using:
# Asker's original attempt, kept exactly as posted (indentation was lost in this
# copy); the answers below replace it with a dict-based tally.
# NOTE(review): the inner loop appears to rescan all of `percents` for every row,
# so each row would also be matched against itself - TODO confirm against the
# original post, since the nesting cannot be recovered from this text.
unique_percents = []
for line in percents:
new_percent = float(line[0])
for inner_line in percents:
if line[1] == inner_line[1]:
new_percent += float(inner_line[0])
else:
temp = []
temp.append(new_percent)
temp.append(line[1])
unique_percents.append(temp)
break
I think it should work, but it's not adding the percents up and still has the duplicates. Perhaps I'm not understanding how "break" works?
I'll also take suggestions of a better loop structure or algorithm to use. Thanks, David.

You want to use a dict, but collections.defaultdict can come in really handy here so that you don't have to worry about whether the key exists in the dict or not -- it just defaults to 0.0:
import collections

lines = [[0.2, 'A'], [0.1, 'A'], [0.3, 'A'], [0.3, 'B'], [0.2, 'C'], [0.5, 'C']]

# defaultdict(float) starts every unseen key at 0.0, so no existence check is needed.
amounts = collections.defaultdict(float)
for amount, letter in lines:
    amounts[letter] += amount

# Sorting the (letter, amount) pairs prints the totals in alphabetical order.
for letter, amount in sorted(amounts.items()):
    print(amount, letter)

Try this out:
# Running total per key; `percents` is the list of [value, key] rows from the question.
result = {}
for line in percents:
    value, key = line
    # dict.get supplies 0 the first time a key is seen.
    result[key] = result.get(key, 0) + float(value)

total = {}
data = [('0.1', 'A'), ('0.2', 'A'), ('.3', 'B'), ('.4', 'B'), ('-10', 'C')]

# The amounts arrive as strings, so convert each one before adding it to its key's sum.
for amount, key in data:
    total[key] = total.get(key, 0.0) + float(amount)

for key, amount in total.items():
    print(key, amount)

Since all of the letter grades are grouped together, you can use itertools.groupby (and if not, just sort the list ahead of time to make them so):
from itertools import groupby

data = [
    [0.2, 'A'],
    [0.1, 'A'],
    [0.3, 'A'],
    [0.3, 'B'],
    [0.2, 'C'],
    [0.5, 'C'],
]

# groupby only merges *adjacent* rows with equal keys, so sort by label first;
# without this, a label that reappears later would overwrite its earlier sum.
by_label = sorted(data, key=lambda row: row[1])
summary = {
    label: sum(row[0] for row in rows)
    for label, rows in groupby(by_label, key=lambda row: row[1])
}
# The printed float digits depend on the Python version; the sums are
# 0.6, 0.3 and 0.7 up to binary rounding.
print(summary)
Gives:
{'A': 0.60000000000000009, 'C': 0.69999999999999996, 'B': 0.29999999999999999}

If you have a list of lists like this:
[ [0.2, A], [0.1, A], ...] (in fact it looks like a list of tuples :)
# `lst` is the list of (value, letter) pairs described above.
res_dict = {}
for pair in lst:
    letter = pair[1]
    val = pair[0]
    # EAFP: add to the running total, creating the entry the first time a letter is seen.
    try:
        res_dict[letter] += val
    except KeyError:
        res_dict[letter] = val
# .items() is required here: iterating the dict itself yields only the keys,
# which cannot be unpacked into (letter, val).
res_lst = [(val, letter) for letter, val in res_dict.items()]  # note, a list of tuples!

Using collections.defaultdict to tally values
(assuming text data in d):
>>> s=collections.defaultdict(float)
>>> for ln in d:
... v,k=ln.split()
... s[k] += float(v)
>>> s
defaultdict(<type 'float'>, {'A': 0.60000000000000009, 'C': 0.69999999999999996, 'B': 0.29999999999999999})
>>> ["%s %s" % (v,k) for k,v in s.iteritems()]
['0.6 A', '0.7 C', '0.3 B']
>>>

If you are using Python 2.7, or Python 3.1 or newer, you can use collections.Counter. I also suggest using decimal.Decimal instead of floats, so the sums come out exact:
# collections.Counter is available in Python 2.7 and in 3.1 and newer.
from collections import Counter
from decimal import Decimal

lines = ["0.2 A", "0.1 A", "0.3 A", "0.3 B", "0.2 C", "0.5 C"]

# Decimal parses the text exactly, so the totals are 0.6 / 0.3 / 0.7 with no
# binary floating-point noise.
results = Counter()
for line in lines:
    percent, label = line.split()
    results[label] += Decimal(percent)
print(results)
The result is:
Counter({'C': Decimal('0.7'), 'A': Decimal('0.6'), 'B': Decimal('0.3')})

This is verbose, but works:
# Written for Python 2.7; it also runs unchanged on Python 3.
lines = """0.2 A
0.1 A
0.3 A
0.3 B
0.2 C
0.5 C"""
lines = lines.split('\n')

pctg2total = {}   # label -> running total
thing2index = {}  # label -> position of its first appearance
index = 0
for line in lines:
    pctg, thing = line.split()
    pctg = float(pctg)
    if thing not in thing2index:
        # First time this label is seen: remember its order and start its total.
        thing2index[thing] = index
        index = index + 1
        pctg2total[thing] = pctg
    else:
        pctg2total[thing] = pctg2total[thing] + pctg

output = ((pctg2total[thing], thing) for thing in pctg2total)
# Let's sort by the first occurrence.
output = sorted(output, key=lambda pair: thing2index[pair[1]])
print(output)
>>>
[(0.60000000000000009, 'A'), (0.29999999999999999, 'B'), (0.69999999999999996, 'C')]

letters = {}
# The with-block closes the file even if a line fails to parse.
with open("data", "r") as handle:
    for line in handle:
        lineStrip = line.strip().split()
        if not lineStrip:
            # Skip blank lines instead of failing on lineStrip[0].
            continue
        percent = float(lineStrip[0])
        letter = lineStrip[1]
        letters[letter] = letters.get(letter, 0.0) + percent

# Key order and the number of float digits printed depend on the Python version.
for letter, percent in letters.items():
    print(letter, percent)
A 0.6
C 0.7
B 0.3

Let's say we have this:
raw = """
0.2 A
0.1 A
0.3 A
0.3 B
0.2 C
0.5 C"""

# The literal starts with a newline, so splitlines() yields an empty first
# line; `if line` drops it before splitting into (amount, letter).
data = [(letter, float(amount))
        for amount, letter in (line.split() for line in raw.splitlines() if line)]
print(data)
# [('A', 0.2), ('A', 0.1), ('A', 0.3), ('B', 0.3), ('C', 0.2), ('C', 0.5)]
You can now just go through this list and sum the values per letter:
# `data` is the list of (letter, value) pairs built above.
counter = {}
for letter, val in data:
    # Start each letter at 0 and accumulate its values.
    counter[letter] = counter.get(letter, 0) + val
print(list(counter.items()))
Or group values together and use sum:
from itertools import groupby

# `data` is the list of (letter, value) pairs built above.
# You want the name and the sum of the values from each group, where the
# group name of an item `p` is given by `p[0]`. Sorting first puts equal
# names next to each other, which groupby needs.
print([(name, sum(value for _, value in grp))
       for name, grp in groupby(sorted(data), key=lambda p: p[0])])

>>> from itertools import groupby, imap
>>> from operator import itemgetter
>>> data = [['0.2', 'A'], ['0.1', 'A'], ['0.3', 'A'], ['0.3', 'B'], ['0.2', 'C'], ['0.5', 'C']]
>>> # data = sorted(data, key=itemgetter(1))
...
>>> for k, g in groupby(data, key=itemgetter(1)):
... print sum(imap(float, imap(itemgetter(0), g))), k
...
0.6 A
0.3 B
0.7 C
>>>

Related

Python - Replace string characters and get all combinations

I want to map the dictionary d = {'R': ['a', 'g'], 'Y': ['c', 't']} to the string s = '----YY----RR----' to get the following output:
----cc----aa----
----cc----ag----
----cc----ga----
----cc----gg----
----ct----aa----
----ct----ag----
----ct----ga----
----ct----gg----
----tc----aa----
----tc----ag----
----tc----ga----
----tc----gg----
----tt----aa----
----tt----ag----
----tt----ga----
----tt----gg----
My (very) inefficient code is as below:
# Asker's brute-force attempt, kept as posted (indentation was lost in this copy).
# NOTE(review): it reads `seq` although the text above names the string `s` -
# presumably the same string; confirm against the original post.
seqs = set()
for k,v in d.items():
for i in v:
i_seq = seq.replace(k,i,1)
for n in v:
n_seq = i_seq.replace(k,n,1)
for k2,v2 in d.items():
for i2 in v2:
i2_seq = n_seq.replace(k2,i2,1)
for n2 in v2:
n2_seq = i2_seq.replace(k2,n2,1)
# Keep only strings in which no 'Y' or 'R' placeholder is left.
if not 'Y' in n2_seq and not 'R' in n2_seq:
seqs.add(n2_seq)
What is a smarter way to do that?
A general approach without itertools:
def replaceCombinations(s, d):
    """Return every string obtained by substituting the mapped characters of s.

    Each character of ``s`` that is a key of ``d`` is replaced, independently
    per position, by each of the alternatives in ``d[char]``; all other
    characters are kept as they are.

    Args:
        s: The template string.
        d: Mapping from a placeholder character to an iterable of replacements.

    Returns:
        A set with one string per combination of replacements.
    """
    # Grow the results position by position. Working on prefixes (instead of
    # str.replace on the whole string) keeps the substitution tied to the
    # current position even when a replacement is itself a key of d.
    results = {""}
    for char in s:
        choices = d[char] if char in d else [char]
        results = {prefix + choice for prefix in results for choice in choices}
    return results


string = "----YY----RR----"
d = {'R': ['a', 'g'], 'Y': ['c', 't']}
for c in sorted(replaceCombinations(string, d)):
    print(c)
Use itertools.product:
from itertools import product

d = {'R': ['a', 'g'], 'Y': ['c', 't']}

# 'Y' comes first in the template '----YY----RR----', so its replacement pairs
# are the outer (slowest-varying) factor; the dashes match the template exactly.
results = [
    f'----{y1}{y2}----{r1}{r2}----'
    for (y1, y2), (r1, r2) in product(product(d['Y'], repeat=2),
                                      product(d['R'], repeat=2))
]
for line in results:
    print(line)
# Output:
# ----cc----aa----
# ----cc----ag----
# ----cc----ga----
# ----cc----gg----
# ----ct----aa----
# ----ct----ag----
# ----ct----ga----
# ----ct----gg----
# ----tc----aa----
# ----tc----ag----
# ----tc----ga----
# ----tc----gg----
# ----tt----aa----
# ----tt----ag----
# ----tt----ga----
# ----tt----gg----

find the first n most frequent characters in it

Given a string, you have to find the first n most frequent characters in it.
If there are two letters with the same frequency, then the alphabetically earlier value should be picked first:
# Asker's code, kept as posted (indentation was lost in this copy) because the
# answers below diagnose it.
string= "aabbccc"
n =2
list = []
#write your code here
char_dict = {}
# Count how often each character occurs.
for char in string:
if char not in char_dict:
char_dict[char] = 1
else:
char_dict[char] += 1
# NOTE: this sorts ascending by (count, char) and then keeps the last two
# entries, so the tie between 'a' and 'b' resolves to 'b' - hence ['b', 'c'].
sorted_dict=sorted(char_dict.items(), key=lambda x: (x[1],x[0]))
sorted_dict = sorted_dict[-2:]
for key, value in sorted_dict:
list.append(key)
print(list)
My output is ['b', 'c'] but it should actually be c and a.
The problem is with the sorting. You need to sort by two fields in different directions (1 ascending and 1 descending). Change the sorted_dict 2 lines to:
sorted_dict = sorted(char_dict.items(), key=lambda x: (-x[1], x[0]))
sorted_dict = sorted_dict[:n]
By the way: avoid using the names of Python's built-ins (such as list) as variable names, because doing so shadows the built-in. Name it my_list or something similar.
Below is a shorter solution to your problem:
import collections

string = input()
n = int(input())

counts = collections.Counter(string)
# Rank by descending count; ties go to the alphabetically earlier character.
# Slicing (rather than indexing most_common) also copes with n being larger
# than the number of distinct characters.
out = sorted(counts, key=lambda ch: (-counts[ch], ch))[:n]
print(out)
I added the printed output as a comment after each statement.
The code should be self-explanatory.
from collections import defaultdict

string = "aabbccc"
n = 2

# Count the occurrences of every character.
result = defaultdict(int)
for char in string:
    result[char] += 1
print(result)  # defaultdict(<class 'int'>, {'a': 2, 'b': 2, 'c': 3}) - key order may vary by Python version

# Negating the count sorts by frequency descending while the character itself
# stays ascending, so ties are broken alphabetically.
ordered_result = sorted(result.items(), key=lambda x: (-x[1], x[0]))
print(ordered_result)  # [('c', 3), ('a', 2), ('b', 2)]

ordered_list = [x[0] for x in ordered_result]
print(ordered_list[:n])  # ['c', 'a']
from collections import Counter


def char_frequency(string, n):
    """Return the n most frequent characters of string, most frequent first.

    Characters with the same frequency are ordered alphabetically, so for
    "aabbccc" and n=2 the result is ['c', 'a'].

    Args:
        string: The text whose characters are counted.
        n: How many characters to return.

    Returns:
        A list of at most n single-character strings in ranking order.
    """
    # Negated count gives descending frequency; the character breaks ties.
    ranked = sorted(Counter(string).items(), key=lambda item: (-item[1], item[0]))
    # Keep the ranking order: re-sorting alphabetically here would lose the
    # "most frequent first" order asked for in the question.
    return [char for char, _ in ranked[:n]]


print(char_frequency("aabbccc", 2))
string = "good"
dictionary = {}
n = 2

# Count the occurrences of every character.
for char in string:
    dictionary[char] = dictionary.get(char, 0) + 1

# NOTE: this prints the characters that occur exactly n times, which is not
# the same thing as the n most frequent characters.
for char, count in dictionary.items():
    if count == n:
        print(char)
string = "aabbccc"
n = 2

# One entry per distinct character, paired lazily with how often it occurs.
unique_chars = list(set(string))
count_chars = map(string.count, unique_chars)

# Highest count first; equal counts fall back to alphabetical order.
order = sorted(
    zip(unique_chars, count_chars),
    key=lambda pair: (-pair[1], pair[0]),
)
nth = order[:n]
the variable order is basically mapping each letter to its count
or in 1 line
sorted(zip(list(set(string)), map(lambda i: string.count(i),list(set(string)))), key=lambda x: (-x[1], x[0]))[:2]
The result is in the following format
[('c', 3), ('a', 2)]

How to calculate counts and frequencies for pairs in list of lists?

Bases refers to A,T,G and C
sample = [['CGG','ATT'],['GCGC','TAAA']]
# Note on fragility of data: Each element can only be made up only 2 of the 4 bases.
# [['CGG' ==> Only C and G,'ATT' ==> Only A and T],['GCGC'==> Only C and G,'TAAA' ==> Only T and A]]
# Elements like "ATGG" are not present in the data, as they have more than 2 different types of bases
Consider the first pair : ['CGG','ATT']
Calculate frequency of each base in the pairs separately:
CGG => (C = 1/3, G = 2/3)
ATT => (A = 1/3, T = 2/3)
Calculate frequency of occurrence of combination of bases in the pairs. Here, the combinations are 'CA' and 'GT' (Notice, order of the base matters. It is not 'CA','AC','GT' and 'TG'. Just only 'CA' and 'GT').
Pairs => (CA = 1/3, GT = 2/3)
Calculate float(a) = (freq of Pairs) - ((freq of C in CGG) * (freq of A in ATT))
Eg in CA pairs, float (a) = (freq of CA pairs) - ((freq of C in CGG) * (freq of A in ATT))
Output a = (1/3) - ((1/3) * (1/3)) = 0.222222
Calculating "a" for any one combination (either CA pair or GT pair)
NOTE: If the pair is AAAC and CCCA, the freq of C in AAAC would be 1/4, i.e. the frequency of a base is computed within one element of the pair, not over both.
Calculate b
float(b) = (float(a)^2) / ((freq of C in CGG) * (freq of G in CGG) * (freq of A in ATT) * (freq of T in ATT))
Output b = 1
Do this for the entire list
Final Output a = [0.2222, - 0.125]
b = [1, 0.3333]
This code has been adapted from this answer. Please note that there are subtle differences in the two questions and they are NOT the same, in the approach to the problem.
However, I am unable to get this code to run. I get the following error:
for pair, count in i:
TypeError: 'int' object is not iterable
# Asker's code, kept as posted (indentation was lost in this copy) because the
# answer below refers to it.
#Count individual bases.
sample4 = [['CGG','ATT'],['GCGC','TAAA']]
base_counter = Counter()
# NOTE: enumerate yields (index, pair) tuples, so the first item the inner loop
# tries to unpack is the int index - that is the TypeError quoted above.
for i in enumerate(sample4):
for pair, count in i:
base_counter[pair[0]] += count
base_counter[pair[1]] += count
print base_counter
# Get the total for each base.
total_count = sum(base_counter.values())
# Convert counts to frequencies.
base_freq = {}
for base, count in base_counter.items():
base_freq[base] = count / total_count
# Not sure how to write a code to count the number of pairs (Step 2)
# Let's say the counts have been stored in pair_counts
# Examine a pair from the two unique pairs to calculate float_a.
for i in enumerate(sample4):
# NOTE: `float(a) = ...` is not valid Python (a call cannot be assigned to);
# presumably `float_a = ...` was intended, matching the name used further down.
float(a) = (pair_count[pair] / sum(pair_count.values())) - (base_freq[pair[0]] * base_freq[pair[1]])
# Step 7!
for i in enumerate(sample4):
float_b = float_a / float(base_freq[0][0] * base_freq[0][1] * base_freq[1][0] * base_freq[1][1])
You are not really using Counter any different than a plain dict. Try something like the following approach:
>>> sample = [['CGG','ATT'],['GCGC','TAAA']]
>>> from collections import Counter
>>> base_counts = [[Counter(base) for base in sub] for sub in sample]
>>> base_counts
[[Counter({'G': 2, 'C': 1}), Counter({'T': 2, 'A': 1})], [Counter({'G': 2, 'C': 2}), Counter({'A': 3, 'T': 1})]]
Now you can continue with a functional approach using nested comprehensions to transform your data*:
>>> base_freqs = [[{k_v[0]:k_v[1]/len(bases[i]) for i,k_v in enumerate(count.items())} for count in counts]
... for counts, bases in zip(base_counts, sample)]
>>>
>>> base_freqs
[[{'G': 0.6666666666666666, 'C': 0.3333333333333333}, {'A': 0.3333333333333333, 'T': 0.6666666666666666}], [{'G': 0.5, 'C': 0.5}, {'A': 0.75, 'T': 0.25}]]
>>>
*Note, some people do not like big, nested comprehensions like that. I think it's fine as long as you are sticking to functional constructs and not mutating data structures inside your comprehensions. I actually find it very expressive. Others disagree vehemently. You can always unfold that code into nested for-loops.
Anyway, you can then work the same thing with the pairs. First:
>>> pairs = [list(zip(*bases)) for bases in sample]
>>> pairs
[[('C', 'A'), ('G', 'T'), ('G', 'T')], [('G', 'T'), ('C', 'A'), ('G', 'A'), ('C', 'A')]]
>>> pair_counts = [Counter(base_pair) for base_pair in pairs]
>>> pair_counts
[Counter({('G', 'T'): 2, ('C', 'A'): 1}), Counter({('C', 'A'): 2, ('G', 'T'): 1, ('G', 'A'): 1})]
>>>
Now, here it is easier to not use comprehensions so we don't have to calculate total more than once:
>>> pair_freq = []
>>> for count in pair_counts:
... total = sum(count.values())
... pair_freq.append({k:c/total for k,c in count.items()})
...
>>> pair_freq
[{('C', 'A'): 0.3333333333333333, ('G', 'T'): 0.6666666666666666}, {('G', 'T'): 0.25, ('C', 'A'): 0.5, ('G', 'A'): 0.25}]
>>>

py3k: mapping dictionary (string->number) into list (of strings)

Assume we have a dictionary that maps strings to numbers.
How can it be reversed into a list, where each string is stored at the index given by its number?
Let's assume that indices no string maps to can be filled with the empty string ''.
Here example how it works:
>>> dic_into_list({'x':0, 'z':2, 'w':3})
['x', '', 'z', 'w']
d = {'x': 0, 'z': 2, 'w': 3}

# One slot per index up to the largest mapped number; unmapped indices stay ''.
# default=-1 makes an empty dict produce an empty list instead of raising.
lst = [""] * (max(d.values(), default=-1) + 1)
for k, v in d.items():
    lst[v] = k
print(lst)
prints
['x', '', 'z', 'w']
The simplest way is to flip the dict and then iterate up to the maximum value (now key) in the dict:
original = {'x': 0, 'z': 2, 'w': 3}

# Flip to number -> name, then walk every index up to the largest one,
# substituting '' for the indices nothing maps to.
d = {v: k for k, v in original.items()}
print([d.get(i, '') for i in range(max(d) + 1)])
Here is my current solution (I am still looking for a shorter and clearer implementation in the other posts):
def dic_into_list(dic):
    """Turn a name -> index mapping into a list with each name at its index.

    Indices that no name maps to are filled with the empty string.

    Args:
        dic: Mapping from strings to non-negative integer positions.

    Returns:
        A list whose length is the largest index plus one, or an empty list
        when ``dic`` is empty.
    """
    if not dic:
        # max() of an empty sequence raises ValueError; an empty mapping
        # simply has no positions to fill.
        return []
    dicrev = {num: name for name, num in dic.items()}
    return [dicrev.get(i, '') for i in range(max(dicrev) + 1)]

TypeError: 'int' object is not iterable. Why am i getting this error? please help

# Asker's code, kept as posted (indentation was lost in this copy) because the
# answers below diagnose it line by line.
def get_top_k(frequency, k):
temp = frequency
key = ""
tvalues = []
values = []
kk = int(k)
i = 0
# Collect every value of the dict, converted to int, into tvalues.
for i in temp.keys():
key = i
num = [int(frequency[key])]
tvalues += num
# bubble_sort is not defined anywhere in this snippet.
tvalues = bubble_sort(tvalues)
i = 0
# NOTE: kk is an int (see kk = int(k) above), so iterating over it raises
# "TypeError: 'int' object is not iterable".
for i in kk:
num = [int(tvalues[i])]
values += num
print(values)
i = 0
result = {}
# NOTE: same problem here - kk is an int, not an iterable.
for i in kk:
result += {(str(temp[values[i]])):(int(values[i]))}
return result
Perhaps you meant
for i in range(kk):
a bit off topic, but:
for i in temp.keys():
key = i
num = [int(frequency[key])]
tvalues += num
should just be:
tvalues = temp.values()
example:
>>> D = {'a':1, 'b':2, 'c':3, 'd':4}
>>> D.keys()
['a', 'c', 'b', 'd']
>>> D.values()
[1, 3, 2, 4]
>>> D.items()
[('a', 1), ('c', 3), ('b', 2), ('d', 4)]
>>>
and it looks like your code could be changed to this:
>>> D = {'a':1, 'b':2, 'c':3, 'd':4}
>>> def get_top_k(D, k):
... return sorted(D.items(), reverse=True, key=lambda x: x[1])[:k]
...
>>> get_top_k(D, 2)
[('d', 4), ('c', 3)]
>>>
You have for i in kk and kk is just an integer. You can't iterate over an integer, you can only iterate over a sequence/iterable.
You probably want for i in range(kk) if you want to iterate from 0 to (kk-1).
Because kk = int(k)
kk is only one single number, not an array of numbers
What are you trying to do? Knowing that would help us fix it.

Categories