I have a String like this str = "aabcccdfffeeeeettaaaattiioccc"
I need output like this Result ={aa: 1;b:1;ccc:2;d:1;fff:1;eeeee:1;tt:2;aaaa:1;ii:1;o:1;ccc:1}
I have tried it like this so far:
def repeating_letters(the_string):
    """Count each maximal run of identical consecutive characters.

    Returns a dict mapping each run (e.g. "aa", "ccc") to the number of times
    that exact run occurs, e.g. "aabccc" -> {"aa": 1, "b": 1, "ccc": 1}.

    Fixes vs the original attempt:
    - the_string[i+1] raised IndexError on the last character;
    - the_string[i] == the_string[i] was always true (no-op check);
    - nothing was ever counted; now the run counts are built and returned.
    """
    runs = []
    start = 0  # index where the current run began
    for i in range(1, len(the_string) + 1):
        # Close the run at end-of-string or when the character changes.
        if i == len(the_string) or the_string[i] != the_string[start]:
            runs.append(the_string[start:i])
            start = i
    counts = {}
    for run in runs:
        counts[run] = counts.get(run, 0) + 1
    return counts

if __name__ == "__main__":  # fixed: was `if name__== "__main":`
    the_string = "aaafassskfahfioejwwa"
    print(repeating_letters(the_string))
Hints
I would follow this steps:
Create a list where I will store my partial strings
Start iterating the string
Store the initial position and the current character
Keep iterating until the character is different
Store in the list the partial string from the initial position you stored until 1 less than the current position
Update the initial position to the current one and the current character
Use the list to create a collections.Counter
About your code, the_string[i] == the_string[i] will always be true.
SPOILER: solution
from collections import Counter
def repeating_letters(the_string):
    """Count how often each maximal run of identical characters occurs.

    Note: an empty input yields Counter({'': 1}), because the trailing
    flush always appends the (possibly empty) final slice.
    """
    runs = []
    start = 0  # start index of the run currently being scanned
    for idx, ch in enumerate(the_string):
        if ch != the_string[start]:
            runs.append(the_string[start:idx])
            start = idx
    runs.append(the_string[start:])  # flush the last (or only) run
    return Counter(runs)
As @prahantrana mentions in a comment, getting the partials can be done in a one-liner with the groupby function from the itertools library.
from collections import Counter
from itertools import groupby
def repeating_letters(the_string):
    """Count each maximal run of identical characters via itertools.groupby."""
    runs = (''.join(chars) for _key, chars in groupby(the_string))
    return Counter(runs)
Or
from collections import Counter
from itertools import groupby
def repeating_letters(the_string):
    """Count each maximal run, rebuilding every run as char * run_length."""
    counts = Counter()
    for char, run in groupby(the_string):
        counts[char * len(list(run))] += 1
    return counts
I'm not sure which of them is faster.
from collections import Counter
from itertools import groupby
def splitter(text):
    """Return the frequency of each maximal run of identical characters in text."""
    runs = []
    for _key, group in groupby(text):
        runs.append(''.join(group))
    return Counter(runs)

l = 'aaaabcccdfffeeeeettfffaaaattiioccceeeeeeaaaa'
print(splitter(l))
output
Counter({'aaaa': 3, 'ccc': 2, 'fff': 2, 'tt': 2, 'b': 1, 'd': 1, 'eeeee': 1, 'ii': 1, 'o': 1, 'eeeeee': 1})
Another way, hand-coded, not using any library:
from collections import Counter
def function(string):
    """Return the frequency of each maximal run of identical consecutive
    characters, built by hand without itertools."""
    if len(string) == 0:
        return Counter('')
    runs = []
    current = [string[0]]  # characters of the run being accumulated
    for ch in string[1:]:
        if ch == current[-1]:
            current.append(ch)
        else:
            runs.append(''.join(current))
            current = [ch]
    runs.append(''.join(current))  # flush the final run
    return Counter(runs)

l = 'aaaabcccdfffeeeeettfffaaaattiioccceeeeeeaaaa'
print(function(l))
output
Counter({'aaaa': 3, 'ccc': 2, 'fff': 2, 'tt': 2, 'b': 1, 'd': 1, 'eeeee': 1, 'ii': 1, 'o': 1, 'eeeeee': 1})
Related
I have analyzed text data and now I want to count keywords that meet specific conditions (dates, category, etc), from the result of the analysis. The result of the analysis is over 50 thousand each, and I have 1500 conditions. Is there an efficient/fast way to extract keywords meeting the condition?
Below is the code I wrote and it is very time-consuming so I need some efficient way.
from collections import defaultdict
from typing import DefaultDict
# function for counting keywords
def count_words(top_rel: DefaultDict, top_pos: DefaultDict, top_neg: DefaultDict, data: pd.DataFrame):
    """Accumulate word frequencies from columns "1"/"2"/"3" of `data` into the
    three running default-dicts and return (top_rel, top_pos, top_neg).

    Cells in those columns hold either real lists of words or their string
    repr (as read back from TSV); string cells are parsed with
    ast.literal_eval. Fix vs original: the two branches were identical except
    for the parsing step, so the duplication is collapsed into one loop.
    """
    # Sniff one cell: str means the frame came straight from disk.
    is_serialized = isinstance(data.loc[:, "3"].values[0], str)
    parse = ast.literal_eval if is_serialized else (lambda cell: cell)
    for _i, item in data.loc[:, "0":"3"].iterrows():
        for pos_word in parse(item["1"]):
            top_pos[pos_word] += 1
        for neg_word in parse(item["2"]):
            top_neg[neg_word] += 1
        for rel_word in parse(item["3"]):
            top_rel[rel_word] += 1
    return top_rel, top_pos, top_neg
# Create conditions
# NOTE(review): this fragment depends on names defined elsewhere (cp, data,
# start_date, datetime, dates_queue, PurePath, pd) — not runnable in isolation.
cat_ids = [subcats['id'] for subcats in cp.cat_config['cat'].values()] # cat ids in the category table
# One row per (category, region, start date, target, ISO week) combination.
index = pd.MultiIndex.from_product([cat_ids, data.code.unique(), [start_date.strftime("%Y%m%d")],
data.target.unique(), [datetime.datetime.strptime(str(data._dates.unique()[0]),
"%Y%m%d").date().isocalendar()[1]]], names=["category_code", "region_code", "start_date",
"target", "year_week"]) # Cartesian product
top_word_id = pd.DataFrame(index=index).reset_index()
# Create defaultdict for each condition
# Each condition row carries three running word-frequency maps.
top_word_id.loc[:, 'weekly_associated_top_word'] = [defaultdict(int) for _ in range(top_word_id.shape[0])]
top_word_id.loc[:, 'weekly_positive_top_word'] = [defaultdict(int) for _ in range(top_word_id.shape[0])]
top_word_id.loc[:, 'weekly_negative_top_word'] = [defaultdict(int) for _ in range(top_word_id.shape[0])]
# for specific periods,
while dates_queue:
date = dates_queue.popleft()
date_str = date.strftime("%Y%m%d.tsv")
data = pd.read_csv(PurePath("../results", date_str), sep='\t', engine='python', encoding='utf-8')
for i, item in top_word_id.iterrows(): # for each condition
# find data matched to the condition
id = item.loc["category_code"]
target = item.loc['target']
code = item.loc['region_code']
category_data = data[data.loc[:, id] == 1]
if category_data.shape[0] == 0:
continue
temp = category_data[(category_data.loc[:, 'target'] == target) & (category_data.loc[:, 'code'] == code)]
if temp.shape[0] == 0:
continue
# NOTE(review): the unfiltered `data` is passed here, not the filtered `temp`
# computed just above — confirm which frame was meant. Also check the unpack
# order against count_words' return order (top_rel, top_pos, top_neg).
top_pos, top_neg, top_rel = count_words(top_word_id.iloc[i, 6], top_word_id.iloc[i, 7], top_word_id.iloc[i, 8], data)
# BUG(review): `rel`, `pos` and `neg` are undefined names — these should almost
# certainly be the top_rel / top_pos / top_neg returned by the call above.
top_word_id.at[i, "weekly_associated_top_word"] = rel
top_word_id.at[i, "weekly_positive_top_word"] = pos
top_word_id.at[i, "weekly_negative_top_word"] = neg
EDIT
I really want to show you a sample, but it is too large and Korean language, I don't think you can get it. Instead, I illustrated the pseudo-code of the logic.
Input
data (pd.DataFrame): The input is collection of documents in a day. It has columns named target, category and code. Also, the data contains columns named 0, 1, 2, 3. Each element is a list of words. (e.g. data.loc[0, "0"] = ['a', 'b', 'c'], data.loc[0, "1"] = ['hello', 'world', '.'])
top_word_id (pd.DataFrame): Each row of the DataFrame represents each condition.
Algorithm: What I want is to find the row of the data where the row meets some specific conditions (i.e. target, category and code). A condition is each row of top_word_id as I mentioned before.
Output: Let's say I want to find the data which meets the condition and the condition is j-th row of top_word_id. The number of the data where meets the condition are 2 which are i1 and i2 of the data. So I want to aggregate the word frequency of i1 and i2 of the data. The result of the aggregation must be kept because I want to aggregate the word frequencies of the documents of today and tomorrow.
In the simple use case, given an iterable, you can use collections.Counter object, https://docs.python.org/3/library/collections.html#collections.Counter e.g.
>>> from collections import Counter
>>> mylist = [1,2,3,3,2,1]
>>> Counter(mylist)
Counter({1: 2, 2: 2, 3: 2})
Given a string:
>>> text = "This is a sentence with repeated words words words in the sentence"
>>> tokenized_text = text.split()
>>> Counter(tokenized_text)
Counter({'This': 1,
'is': 1,
'a': 1,
'sentence': 2,
'with': 1,
'repeated': 1,
'words': 3,
'in': 1,
'the': 1})
To update a counter:
>>> counter = Counter()
>>> counter.update(tokenized_text_1) # assuming tokenized text is an iterable of strings.
>>> counter.update(tokenized_text_2)
I achieved efficient and fast logic by utilizing collections.Counter, Cython and multiprocessing.Pool. I replace the counting part with Counter and utilizing Cython and multiprocessing.Pool for efficiency.
The below is the entire code:
from collections import defaultdict, Counter
from typing import DefaultDict
def count_words(top_pos: DefaultDict, top_neg: DefaultDict, top_rel: DefaultDict, data: pd.DataFrame):
    """Accumulate word frequencies from columns "1"/"2"/"3" of `data` into the
    three running default-dicts and return (top_pos, top_neg, top_rel).

    Each cell holds either a list of words or its string repr (as read back
    from TSV); a per-row Counter is built, then folded into the totals.

    Fixes vs original:
    - the `elif` branch printed data_pos/data_neg/data_rel BEFORE defining
      them (NameError);
    - `for item in data_neg.items()` / `data_rel.items()` iterated
      (index, value) tuples and then called .items() on a tuple
      (AttributeError) — the Series itself is iterated now;
    - debug prints removed; duplicated fold loops collapsed into one.
    """
    sample = data.loc[:, "3"].values[0]
    if isinstance(sample, str):
        parse = ast.literal_eval  # cells are string reprs of lists
    elif isinstance(sample, list):
        def parse(cell):
            return cell  # cells are already lists
    else:
        raise ValueError("The type must be either list or str")
    for column, top in (("1", top_pos), ("2", top_neg), ("3", top_rel)):
        # One Counter per row, folded into the running totals.
        for row_counter in data.loc[:, column].apply(lambda cell: Counter(parse(cell))):
            for word, freq in row_counter.items():
                top[word] += freq
    return top_pos, top_neg, top_rel
def test(data, top_word_id):
"""For each condition row of top_word_id, select matching rows of data and fold
their word frequencies into that row's running dicts; returns top_word_id."""
for i, item in top_word_id.iterrows():
# One condition = (category_code, target, region_code).
id = item.loc["category_code"]
target = item.loc['target']
code = item.loc['region_code']
# Rows flagged for this category: the category id is itself a column label.
category_data = data[data.loc[:, id] == 1]
if category_data.shape[0] == 0:
continue
temp = category_data[(category_data.loc[:, 'target'] == target) & (category_data.loc[:, 'code'] == code)]
if temp.shape[0] == 0:
continue
# NOTE(review): the unfiltered `data` (not `temp`) is passed to count_words —
# it looks like `temp` was intended; confirm before relying on the counts.
top_pos, top_neg, top_rel = count_words(top_word_id.loc[i, "weekly_positive_top_word"], top_word_id.loc[i, "weekly_negative_top_word"], top_word_id.loc[i, "weekly_associated_top_word"], data)
top_word_id.at[i, "weekly_associated_top_word"] = top_rel
top_word_id.at[i, "weekly_positive_top_word"] = top_pos
top_word_id.at[i, "weekly_negative_top_word"] = top_neg
return top_word_id
from multiprocessing import Pool, cpu_count
from contextlib import contextmanager
import numpy as np
@contextmanager
def poolcontext(*args, **kwargs):
    """Context-manager generator: yields a multiprocessing Pool and always
    terminates it on exit (meant to be wrapped with contextlib.contextmanager —
    the `#contextmanager` line above is a markdown-mangled `@contextmanager`).

    Fix: the Pool is created BEFORE the try block. In the original, a failing
    Pool(...) constructor left `pool` unbound, so the finally clause raised a
    confusing NameError that masked the real error.
    """
    pool = Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        pool.terminate()
def parallelize_aggregation(data, top_word_id, func):
    """Split top_word_id into one chunk per CPU core and apply
    func(data, chunk) to each chunk in a worker pool; returns the list of
    per-chunk results.

    Fix: the original ignored its `func` parameter and hard-coded the sibling
    `test` function in pool.starmap; `func` is now actually used (the caller
    passes the same `test`, so observed behavior is unchanged).
    """
    num_cores = cpu_count()
    df_split = np.array_split(top_word_id, num_cores, axis=0)
    with poolcontext(processes=num_cores) as pool:
        # Every worker gets the full `data` plus its own slice of conditions.
        results = pool.starmap(func, zip([data] * num_cores, df_split))
    return results
parallelize_aggregation(data, top_word_id, aggregate.test)
The below table illustrates times of the codes:
Code
Times
Cython (the code in the question)
4749s
Cython + Counter
3066s
Cython + Counter + multiprocessing.Pool
10s
Hello I asked this question previously and I wanted to adjust the code that I have now. I want to adjust this code so that if a letter is not present in a text string it still returns the value 0 to it assigned.
# NOTE(review): fragment of a larger function — `text` and `let` are its
# parameters, and the bare `return` below only makes sense inside that def.
count = {}
# Tally only the characters of `text` that appear in `let`.
for l in text.lower():
if l in let:
if l in count.keys():
count[l] += 1
else:
count[l] = 1
# Letters of `let` absent from `text` never get a key — they are missing
# rather than 0, which is exactly the problem the question asks about.
return count
It currently returns this:
example = "Sample String"
print(func(example, "sao"))
{'s': 2, 'a' : 1}
This would be my desired output
example = "Sample String"
print(func(example, "sao"))
{'s': 2, 'a' : 1, 'o' :0}
If you don't mind using tools designed especially for your purpose, then the following will do:
from collections import Counter
def myfunc(inp, vals):
    """Count, case-insensitively, how often each char of vals occurs in inp;
    chars absent from inp map to 0 (Counter returns 0 for missing keys).

    Fixes vs original: removed an invisible zero-width space before `return`
    (SyntaxError), added the missing .lower() so 'Sample String' yields
    {'s': 2, ...} as the question expects, and closed the print call's paren.
    """
    c = Counter(inp.lower())
    return {e: c[e] for e in vals}

s = 'Sample String'
print(myfunc(s, 'sao'))
Otherwise you can explicitly set all missing values in your functions.
def func(inp, vals):
    """Count, case-insensitively, how often each char of vals occurs in inp.

    Every char of vals is pre-seeded with 0, so absent chars appear in the
    result as 0 — the behavior the question asks for.

    Fix: the original forgot to lowercase inp (unlike the asker's version,
    which iterated text.lower()), so 'Sample String' gave {'s': 0, ...}
    instead of the expected {'s': 2, ...}.
    """
    count = {e: 0 for e in vals}
    for s in inp.lower():
        if s in count:
            count[s] += 1
    return count
# Count requested letters with a dict comprehension over str.count.
def stringFunc(string, letters):
    """Count, case-insensitively, how often each char of letters occurs in
    string; chars not present count as 0."""
    lowered = string.lower()
    return {letter: lowered.count(letter) for letter in letters}

stringFunc('Hello World', 'lohdx')
# {'l': 3, 'o': 2, 'h': 1, 'd': 1, 'x': 0}
You can use a Dict Comprehensions and str.count:
def count_letters(text, letters):
    """Map each char of letters to its count in text (case-insensitive, 0 if
    absent)."""
    lowered = text.lower()
    result = {}
    for ch in letters:
        result[ch] = lowered.count(ch)
    return result
result: {'s': 2, 'a': 1, 'o': 0}
You can use collections.Counter and obtain character counts via the get method:
from collections import Counter
def func(string, chars):
    """Map each char of chars to its case-insensitive count in string,
    defaulting to 0 for chars that never occur."""
    counts = Counter(string.lower())
    result = {}
    for ch in chars:
        result[ch] = counts.get(ch, 0)
    return result
I'm trying to write a code that does the following:
Takes a number of strings as input
Splits each string into two contiguous substrings of equal length
Returns the minimum number of characters to change to make the two substrings into anagrams of one another (if it's not possible, it must return -1).
Sample Input
6
aaabbb
ab
abc
mnop
xyyx
xaxbbbxx
Sample Output
3
1
-1
2
0
1
For a more detailed explanation about the problem, kindly check this link (no login or sign-up needed).
I've approached the solution pretty well, but it seems like I'm not getting something right, my output is usually a bit greater or smaller than what's expected, and I really don't know what's causing the problem. Here's my code:
# Read n test strings from stdin.
n = int(input())
user_input = []
for k in range(n):
user_input.append(input())
results = []
for i in user_input:
# Only even-length strings can be split into two equal halves.
if len(list(i))%2 == 0:
left = i[:len(list(i))//2]
right = i[len(list(i))//2:]
# Per-half character frequency dicts.
left_dict = dict((letter,left.count(letter)) for letter in set(left))
right_dict = dict((letter,right.count(letter)) for letter in set(right))
if left_dict == right_dict:
results.append(0)
else:
# BUG(review): only characters whose counts match EXACTLY in both halves
# are kept, so partially-shared characters (e.g. 'b' in "xaxb"/"bbxx") are
# dropped entirely. The fix is min(left_dict[k], right_dict[k]) over the
# shared keys, as shown in the corrected version later in this discussion.
shared_items = {k: left_dict[k] for k in left_dict if k in right_dict and left_dict[k] == right_dict[k]}
results.append(len(left) - len(shared_items))
else:
results.append(-1)
print(results)
I appreciate any help in advance.
You started great, with calculating the count for each character in both the substrings, but you never used this power technically.
In this statement:
shared_items = {k: left_dict[k] for k in left_dict if k in right_dict and left_dict[k] == right_dict[k]}
you just calculate items that are in both the dictionary and have same count:
eg. in your 6th testcase:
xaxbbbxx
left_dict will be {'b': 1, 'a': 1, 'x': 2}
right_dict will be {'b': 2, 'x': 2}
and shared_item the way you calculate will give you: {'x':2}
But this doesn't correctly list all the items that are shared.
The correct no. of shared_items should be : {'x':2, 'b':1}
So for that,
What we could then do is calculate the minimum of the item quantities common in the left_dict and right_dict.
i.e. min(left_dict[k],right_dict[k])
the result.append statement will also change accordingly:
else:
shared_items = {k:min(left_dict[k],right_dict[k]) for k in left_dict if k in right_dict}
results.append(len(left)-sum(shared_items.values()))
Full execution:
# Read n strings from stdin, then compute the minimum number of character
# changes needed to make the two halves of each string anagrams (-1 when the
# length is odd and the string cannot be halved).
n = int(input())
user_input = [input() for _ in range(n)]
results = []
for word in user_input:
    if len(word) % 2 != 0:
        results.append(-1)
        continue
    half = len(word) // 2
    left, right = word[:half], word[half:]
    # Per-half character frequency dicts.
    left_dict = {ch: left.count(ch) for ch in set(left)}
    right_dict = {ch: right.count(ch) for ch in set(right)}
    if left_dict == right_dict:
        results.append(0)
    else:
        # A character contributes min(count_left, count_right) shared slots.
        shared_items = {ch: min(left_dict[ch], right_dict[ch]) for ch in left_dict if ch in right_dict}
        results.append(half - sum(shared_items.values()))
print(results)
Input:
6
aaabbb
ab
abc
mnop
xyyx
xaxbbbxx
Output:
[3, 1, -1, 2, 0, 1]
which you could then of course use, e.g. print('\n'.join(map(str, results))) (note the map(str, ...) — the results are ints), to get the output in the required format.
Two words are anagrams if the same letters appear with the same occurrencies.
from collections import Counter
sl = ["aaabbb", "ab", "abc", "mnop", "xyyx", "xaxbbbxx"]
def f(s):
    """Minimum changes to make the string's two halves anagrams of each other;
    -1 when the length is odd. Prints the surplus Counter as a debug aid."""
    half, odd = divmod(len(s), 2)
    if odd:
        return -1
    first, second = s[:half], s[half:]
    # Characters in the second half with no partner in the first half.
    surplus = Counter(second) - Counter(first)
    print(surplus)
    return sum(surplus.values())
list(map(f, sl))
Counter({'b': 3})
Counter({'b': 1})
Counter({'o': 1, 'p': 1})
Counter()
Counter({'b': 1})
[3, 1, -1, 2, 0, 1]
I'm trying to create a program where if you input a word, it will print out each letter of the word and how many times the letter appears in that word.
Eg; when I input "aaaarggh", the output should be "a 4 r 1 g 2 h 1".
def compressed(word):
    """Print each distinct letter of word with its total count, in first-seen
    order, on one line — e.g. "aaaarggh" -> "a 4 r 1 g 2 h 1".

    Also returns that line so callers/tests can inspect it.

    Fix vs original: the while/for/break tangle printed each letter with its
    position instead of counting; no dict is used, per the question's
    "(no using dict method)" constraint.
    """
    seen = ""  # letters already reported, in order of first appearance
    parts = []
    for letter in word:
        if letter not in seen:
            seen += letter
            parts.append(f"{letter} {word.count(letter)}")
    line = " ".join(parts)
    print(line)
    return line
# Interactive driver: read a word from stdin and show its letter counts.
print("Enter a word:")
word = input()
compressed(word)
So far it just prints out each letter and position in the word.
Any help appreciated, thank you!
(no using dict method)
Just type (for Python 2.7+):
import collections
dict(collections.Counter('aaaarggh'))
having:
{'a': 4, 'g': 2, 'h': 1, 'r': 1}
a = "aaaarggh"
# One str.count pass per distinct character.
d = {char: a.count(char) for char in set(a)}
print(d)
output
{'a': 4, 'h': 1, 'r': 1, 'g': 2}
try this, You can use counter it will return dict type
from collections import Counter
print(Counter("aaaarggh"))
One way of implementing it using a dict:
def compressed(word):
    """Tally each character of word, then print the pairs as "<count><char>"
    separated by single spaces (insertion order, trailing space included)."""
    letters = dict()
    for c in word:
        if c in letters:
            letters[c] += 1
        else:
            letters[c] = 1
    for key, value in letters.items():
        print(f'{value}{key}', end=' ')
As others have suggested, you can do this easily with a dict !
test_input = "aaaarggh"

def compressed(word):
    """Return a dict mapping each letter of word to its occurrence count.

    Fix: the original iterated the module-level `test_input` instead of the
    `word` parameter, so every call returned the counts of "aaaarggh"
    regardless of its argument.
    """
    letter_dict = {}
    for letter in word:
        if letter not in letter_dict:
            letter_dict[letter] = 1
        else:
            letter_dict[letter] = letter_dict[letter] + 1
    return letter_dict

print(compressed(test_input))
Outputs:
{'a': 4, 'r': 1, 'g': 2, 'h': 1}
Counter is concise. But here's an alternative using defaultdict, which is a subclass of dict.
from collections import defaultdict

test_input = "aaaarggh"
# defaultdict(int) starts every new key at 0, so no membership check is needed.
d = defaultdict(int)
for ch in test_input:
    d[ch] = d[ch] + 1
https://docs.python.org/3.6/library/collections.html#defaultdict-examples
def counter(word):
    """Return a dict mapping each character of word to its total count.

    Cleanup vs original: the local variable `counter` shadowed the function's
    own name, and each character built a throwaway one-entry dict just to feed
    dict.update — both replaced with a direct assignment.
    """
    counts = {}
    for ch in word:
        # str.count rescans word per char; fine for short inputs.
        counts[ch] = word.count(ch)
    return counts

counter("aaaarggh")
So I am trying to implement code that will count the next letter in a sentence, using python.
so for instance,
"""So I am trying to implement code that will count the next letter in a sentence, using
python"""
most common letters one after the other
for 's'
'o' :1
'e' :1
for 'o'
' ' :1
'd' :1
'u' :1
'n' :1
I think you get the idea
I already have written code for counting letters prior
def count_letters(word, char):
    """Return how many elements of word compare equal to char."""
    return sum(1 for c in word if c == char)
As you can see this just counts for letters, but not the next letter. can someone give me a hand on this one?
from collections import Counter, defaultdict

# counts[c1][c2] = how often character c2 immediately follows c1.
counts = defaultdict(Counter)
s = """So I am trying to implement code that will count the next letter in a sentence, using
python""".lower()
# zip the text against itself shifted by one to walk all adjacent pairs.
for lead, follow in zip(s, s[1:]):
    counts[lead][follow] += 1
(apart from being simpler, this should be significantly faster than pault's answer by not iterating over the string for every letter)
Concepts to google that aren't named in the code:
for c1, c2 in ... (namely the fact that there are two variables): tuple unpacking
s[1:]: slicing. Basically this is a copy of the string after the first character.
Here is a relatively terse way to do it:
from itertools import groupby
from collections import Counter
def countTransitionFrequencies(text):
    """Lazily yield (char, Counter of immediately-following chars) pairs.

    The (prev, next) pairs are sorted by prev first, because groupby only
    groups adjacent equal keys.
    """
    pairs = sorted(zip(text, text[1:]), key=lambda pair: pair[0])
    return (
        (prev, Counter(nxt for _, nxt in group))
        for prev, group in groupby(pairs, key=lambda pair: pair[0])
    )
Explanation:
zip creates list of pairs with (previous, next) characters
The pairs are sorted and grouped by the previous character
The frequencies of the next characters (extracted from pairs by kv[1]) are then counted using Counter.
Sorting is not really necessary, but unfortunately, this is how the provided groupby works.
An example:
for k, v in countTransitionFrequencies("hello world"):
print("%r -> %r" % (k, v))
This prints:
' ' -> Counter({'w': 1})
'e' -> Counter({'l': 1})
'h' -> Counter({'e': 1})
'l' -> Counter({'l': 1, 'o': 1, 'd': 1})
'o' -> Counter({' ': 1, 'r': 1})
'r' -> Counter({'l': 1})
'w' -> Counter({'o': 1})
Here's a way using collections.Counter:
Suppose the string you provided was stored in a variable s.
First we iterate over the set of all lower case letters in s. We do this by making another string s_lower which will convert the string s to lowercase. We then wrap this with the set constructor to get unique values.
For each char, we iterate through the string and check to see if the previous letter is equal to char. If so, we store this in a list. Finally, we pass this list into the collections.Counter constructor which will count the occurrences.
Each counter is stored in a dictionary, counts, where the keys are the unique characters in the string.
from collections import Counter
# NOTE(review): `s` is the sample sentence from the question; it must be
# defined before this snippet runs — it is not defined in this file.
counts = {}
s_lower = s.lower()
# For each distinct character, count the characters that immediately follow
# any of its occurrences. Rescans the whole string once per distinct char.
for char in set(s_lower):
counts[char] = Counter(
[c for i, c in enumerate(s_lower) if i > 0 and s_lower[i-1] == char]
)
For your string, this has the following outputs:
>>> print(counts['s'])
#Counter({'i': 1, 'e': 1, 'o': 1})
>>> print(counts['o'])
#Counter({' ': 2, 'd': 1, 'n': 1, 'u': 1})
One caveat is that this method will iterate through the whole string for each unique character, which could potentially make it slow for large lists.
Here is an alternative approach using collections.Counter and collections.defaultdict that only loops through the string once:
from collections import defaultdict, Counter
def count_letters(s):
    """Map each character of s (lowercased) to a Counter of the characters
    that immediately follow it; single pass over the string."""
    lowered = s.lower()
    counts = defaultdict(Counter)
    for curr, nxt in zip(lowered, lowered[1:]):
        counts[curr].update(nxt)
    return counts
counts = count_letters(s)
We loop over each character in the string (except the last) and on each iteration we update a counter using the next character.
This should work, the only thing is it doesn't sort the values, but that can be solved by creating a new dictionary with list of tuples (char, occurrences) and using sorted function on tuple[1].
def countNext(word):
    """Nested-dict transition counts: d[c1][c2] is how often alphabetic c2
    immediately follows alphabetic c1 in word (lowercased). Pairs containing
    any non-letter are skipped."""
    transitions = {}
    word = word.lower()
    for curr, nxt in zip(word, word[1:]):
        if not (curr.isalpha() and nxt.isalpha()):
            continue
        inner = transitions.setdefault(curr, {})
        inner[nxt] = inner.get(nxt, 0) + 1
    return transitions