Most Common letter in a string - python

I am completing an exercise to find the most common letter in a string, excluding punctuation; the result should be in lowercase. So in the example "HHHHello World!!!!!!!!!!" the result should be "h".
What I have so far is:
text=input('Insert String: ')
def mwl(text):
import string
import collections
for p in text:
p.lower()
for l in string.punctuation:
for x in text:
if x==l:
text.replace(x,'')
collist=collections.Counter(text).most_common(1)
print(collist[0][0])
mwl(text)
I would appreciate your help to understand why:
The case is not remaining changed to lower in text
The punctuation is not being permanently removed from the text string

There are several issues:
Strings are immutable. This means that functions like lower() and replace() return the results and leave the original string as is. You need to assign that return value somewhere.
lower() can operate on the entire string: text = text.lower().
For some ideas on how to remove punctuation characters from a string, see Best way to strip punctuation from a string in Python

you can try this:
>>> import re
>>> from collections import Counter
>>> my_string = "HHHHello World!!!!!!!!!!"
>>> Counter("".join(re.findall("[a-z]+",my_string.lower()))).most_common(1)
[('h', 4)]

text = input('Insert String: ')
from string import punctuation
from collections import Counter
def mwl(text):
    """Return the most common character of *text*, lowercased.

    Characters in ``string.punctuation`` and whitespace are skipped, so a
    sentence with many spaces does not report ``' '`` as its most common
    "letter". On a tie the character encountered first wins.

    Raises:
        IndexError: if *text* contains no countable characters (e.g. ``""``).
    """
    skipped = set(punctuation)
    # Lazily lowercase every character that is neither punctuation nor whitespace.
    filtered = (
        ch.lower() for ch in text if ch not in skipped and not ch.isspace()
    )
    # most_common(1) avoids sorting the whole table just to take one entry.
    return Counter(filtered).most_common(1)[0][0]
Or use str.translate to remove the punctuation:
from string import punctuation
from collections import Counter
def mwl(text):
    """Return the most common character of *text*, lowercased.

    Punctuation is deleted with ``str.translate`` and whitespace is ignored,
    so only the remaining characters are counted. On a tie the character
    encountered first wins.

    Raises:
        IndexError: if *text* contains no countable characters (e.g. ``""``).
    """
    # maketrans(x, y, z): the third argument lists the characters to delete.
    # The two-argument form maps x -> y, which would turn spaces into a
    # punctuation character instead of removing anything.
    cleaned = text.lower().translate(str.maketrans("", "", punctuation))
    letters = (ch for ch in cleaned if not ch.isspace())
    return Counter(letters).most_common(1)[0][0]
Using your own code you need to reassign text to the updated string:
def mwl(text):
import string
import collections
text = text.lower()
for l in string.punctuation:
for x in text:
if x == l:
text = text.replace(x,'')
collist=collections.Counter(text).most_common(1)
print(collist[0][0])
Also instead of looping over the text in your code you could just use in:
for l in string.punctuation:
if l in text:
text = text.replace(l,'')

The first big issue is that you never actually assign anything.
p.lower()
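just returns a lowercase version of p. It does not change p, because strings are immutable; you have to assign the result. (Rebinding the loop variable p still leaves text itself untouched, so in practice you want text = text.lower().) For p it should be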
p = p.lower()
Same with the text.replace(x,''). It should be
text = text.replace(x,'')

You could do:
>>> from collections import Counter
>>> from string import ascii_letters
>>> tgt="HHHHello World!!!!!!!!!!"
>>> Counter(c.lower() for c in tgt if c in ascii_letters).most_common(1)
[('h', 4)]

If input is ascii-only then you could use bytes.translate() to convert it to lowercase and remove punctuation:
#!/usr/bin/env python3
from string import ascii_uppercase, ascii_lowercase, punctuation
table = b''.maketrans(ascii_uppercase.encode(), ascii_lowercase.encode())
def normalize_ascii(text, todelete=punctuation.encode()):
return text.encode('ascii', 'strict').translate(table, todelete)
s = "HHHHello World!!!!!!!!!!"
count = [0]*256 # number of all possible bytes
for b in normalize_ascii(s): count[b] += 1 # count bytes
# print the most common byte
print(chr(max(range(len(count)), key=count.__getitem__)))
If you want to count letters in a non-ascii Unicode text then you could use .casefold() method (proper caseless comparison) and remove_punctuation() function:
#!/usr/bin/env python3
from collections import Counter
import regex # $ pip install regex
def remove_punctuation(text):
return regex.sub(r"\p{P}+", "", text)
s = "HHHHello World!!!!!!!!!!"
no_punct = remove_punctuation(s)
characters = (c.casefold() for c in regex.findall(r'\X', no_punct))
print(Counter(characters).most_common(1)[0][0])
r'\X' regex is used to count user-perceived characters instead of mere Unicode codepoints.

Related

To print the count of occurrences of a word ending with "on" in python

my_string = """Strings are gameon amongst gameon the most popular data types in Python. We can create the strings by enclosing characters briton in quotes. Python treats briton single quotes the same as double quotes."""
def count_words(string):
for word in string.split():
if word.endswith("on") == True:
print(word,":",string.count(word))
string = string.replace(word,'')
count_words(my_string)
I want to print all the words that end with "on", along with their number of occurrences in the string. I am getting something like
gameon : 2
gameon : 0
briton : 2
Python : 2
briton : 0
this even after removing the word.
Why is it repeating?
Edit: I can't use any module. Only logic.
You do not need to modify your string while you are counting.
Instead, you can use collections.Counter with a generator expression. It's also worth, as below, converting to lowercase and removing punctuation.
from collections import Counter
from string import punctuation
table = str.maketrans(punctuation, ' ' * len(punctuation))
x = my_string.translate(table).lower()
c = Counter(i for i in x.split() if i.endswith('on'))
print(c)
Counter({'gameon': 2, 'python': 2, 'briton': 2})
my_string = """Strings are gameon amongst gameon the most popular data types in Python. We can create the strings by enclosing characters briton in quotes. Python treats briton single quotes the same as double quotes."""
# Module-level tally: word ending in "on" -> number of times it was seen.
di = {}


def count_words(string):
    """Count every whitespace-separated word of *string* that ends with "on".

    Results are accumulated in the module-level dict ``di`` (word -> count);
    nothing is returned. Words are compared as-is, so "Python." (trailing
    punctuation) does not count as ending with "on".
    """
    # string.split() builds the full word list up front, so there is no need
    # to edit the string while iterating over it.
    for word in string.split():
        if word.endswith("on"):
            di[word] = di.get(word, 0) + 1
count_words(my_string)
for i in di:
print(i,di[i])
You can use a dictionary to achieve the same.
Using collections.Counter
Ex:
import collections
my_string = """Strings are gameon amongst gameon the most popular data types in Python. We can create the strings by enclosing characters briton in quotes. Python treats briton single quotes the same as double quotes."""
def count_words(string):
for word, v in collections.Counter(string.split()).items():
if word.endswith("on"):
print(word,":",v)
count_words(my_string)
Output:
('Python', ':', 1)
('briton', ':', 2)
('gameon', ':', 2)
You can use pandas.Series to value_counts() these words
from string import punctuation
my_string = ''.join(w for w in my_string if w not in set(punctuation))
pd.Series([i for i in my_string.split(" ") if i.endswith("on")]).value_counts()
>> (gameon, 2), (briton, 2), (Python, 2)

Remove characters in a string without multiple calls to str.replace

I'm doing this project for my class and I was wondering whether it is possible to remove a list of vowels, uppercase and lowercase, in just one line instead of how I have it. This is in Python.
I would like it to be a bit simpler than writing this out completely.
Thanks
s= input ('Enter a Sentence: ')
s = str(s.replace ('a',''))
s = str(s.replace ('e',''))
s = str(s.replace ('i',''))
s = str(s.replace ('o',''))
s = str(s.replace ('u',''))
s = str(s.replace ('A',''))
s = str(s.replace ('E',''))
s = str(s.replace ('I',''))
s = str(s.replace ('O',''))
s = str(s.replace ('U',''))
print (s)
You can use str.translate and a dict comprehension:
>>> 'aeiouAEIOU'.translate({ord(x):None for x in 'aeiouAEIOU'})
''
>>>
The dict comprehension is used to create a mapping for str.translate of what characters should be translated into what. Mapping characters to None causes the method to remove them.
Note that you could also use str.maketrans instead of the dict comprehension:
>>> 'aeiouAEIOU'.translate(str.maketrans('', '', 'aeiouAEIOU'))
''
>>>
you can use re module
import re

s = input('Enter a Sentence: ')
# re.sub returns a new string (strings are immutable), so keep the result
# and print it; otherwise the substitution is silently thrown away.
s = re.sub('[AEIOUaeiou]', '', s)
print(s)

Persistent index in python string

I'm trying to get string.index() to ignore instances of a character that it has already located within a string. Here is my best attempt:
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
def save_alphabet(phrase):
saved_alphabet = ""
for item in phrase:
if item in alphabet:
saved_alphabet = saved_alphabet + str(phrase.index(item))
return saved_alphabet
print save_alphabet("aAaEaaUA")
The output I'd like is "1367" but, as it only finds the first instance of item it is outputting "1361".
What's the best way to do this? The returned value should be in string format.
>>> from string import ascii_uppercase as alphabet
>>> "".join([str(i) for i, c in enumerate("aAaEaaUA") if c in alphabet])
'1367'
Regex solution (though I would not prefer regex in this case):
>>> import re
>>> "".join([str(m.start()) for m in re.finditer(r'[A-Z]', "aAaEaaUA")])
'1367'

python string manipulation

I have a string s with nested brackets: s = "AX(p>q)&E((-p)Ur)"
I want to remove all characters between all pairs of brackets and store in a new string like this: new_string = AX&E
i tried doing this:
p = re.compile("\(.*?\)", re.DOTALL)
new_string = p.sub("", s)
It gives output: AX&EUr)
Is there any way to correct this, rather than iterating each element in the string?
Another simple option is removing the innermost parentheses at every stage, until there are no more parentheses:
# Raw string: "\(" in a normal string literal is an invalid escape sequence.
# The pattern matches one innermost group: "(" ... ")" with no parentheses inside.
p = re.compile(r"\([^()]*\)")
count = 1
while count:
    # Each pass removes the current innermost groups; subn reports how many
    # were removed, so the loop stops once a pass changes nothing.
    s, count = p.subn("", s)
Working example: http://ideone.com/WicDK
You can just use string manipulation without regular expression
>>> s = "AX(p>q)&E(qUr)"
>>> [ i.split("(")[0] for i in s.split(")") ]
['AX', '&E', '']
I leave it to you to join the strings up.
>>> import re
>>> s = "AX(p>q)&E(qUr)"
>>> re.compile("""\([^\)]*\)""").sub('', s)
'AX&E'
Yeah, it should be:
>>> import re
>>> s = "AX(p>q)&E(qUr)"
>>> p = re.compile("\(.*?\)", re.DOTALL)
>>> new_string = p.sub("", s)
>>> new_string
'AX&E'
Nested brackets (or tags, ...) cannot be handled in a general way with a regex. See http://www.amazon.de/Mastering-Regular-Expressions-Jeffrey-Friedl/dp/0596528124/ref=sr_1_1?ie=UTF8&s=gateway&qid=1304230523&sr=8-1-spell for details on why. You would need a real parser.
It's possible to construct a regex which can handle two levels of nesting, but they are already ugly, three levels will already be quite long. And you don't want to think about four levels. ;-)
You can use PyParsing to parse the string:
from pyparsing import nestedExpr
import sys
s = "AX(p>q)&E((-p)Ur)"
expr = nestedExpr('(', ')')
result = expr.parseString('(' + s + ')').asList()[0]
s = ''.join(filter(lambda x: isinstance(x, str), result))
print(s)
Most code is from: How can a recursive regexp be implemented in python?
You could use re.subn():
import re
s = 'AX(p>q)&E((-p)Ur)'
while True:
s, n = re.subn(r'\([^)(]*\)', '', s)
if n == 0:
break
print(s)
Output
AX&E
this is just how you do it:
# strings
# double and single quotes use in Python
"hey there! welcome to CIP"
'hey there! welcome to CIP'
"you'll understand python"
'i said, "python is awesome!"'
'i can\'t live without python'
# use of 'r' before string
print(r"\new code", "\n")
first = "code in"
last = "python"
first + last #concatenation
# slicing of strings
user = "code in python!"
print(user)
print(user[5])  # a single character by index
print(user[-3])  # negative indexes count from the end
print(user[2:6])  # slice: index 2 up to (not including) 6
print(user[:6])  # slice from the start
print(user[2:])  # slice to the end
print(len(user))  # length of the string
print(user.upper())  # convert to uppercase
print(user.lstrip())  # strip leading whitespace
print(user.rstrip())  # strip trailing whitespace
print(max(user))  # character with the highest code point
print(min(user))  # character with the lowest code point
# str.join only accepts strings, so convert the numbers first;
# joining a list of ints raises TypeError.
print(user.join(str(n) for n in [1, 2, 3, 4]))
input()

Find inside a string in Python

There is a string, it contains numbers and characters.
I need to find an entire number(s) (in that string) that contains number 467033.
e.g. 1.467033777777777
Thanks
Try this:
import re

# Raw string so the regex escapes (\d, \.) reach the regex engine untouched.
# Matches decimal numbers: optional integer part, a dot, then digits.
RE_NUM = re.compile(r'(\d*\.\d+)')
text = 'eghwodugo83o135.13508yegn1.4670337777777773u87208t'
for num in RE_NUM.findall(text):
    # Keep only the numbers that contain the digit sequence we are after.
    if '467033' in num:
        print(num)
Prints:
1.4670337777777773
Generalized / optimized in response to comment:
def find(text, numbers):
    """Return every run of digits/dots in *text* that contains one of *numbers*.

    Args:
        text: the string to search.
        numbers: iterable of literal digit strings to look for, e.g.
            ``['467033', '13']``.

    Returns:
        A list of the matching number-like substrings, in order of appearance.
        An empty *numbers* yields an empty list.
    """
    # Escape each term so that characters such as "." are matched literally
    # rather than as regex metacharacters.
    pattern = '|'.join(r'[\d.]*%s[\d.]*' % re.escape(n) for n in numbers)
    if not pattern:
        # An empty pattern would match the empty string at every position.
        return []
    re_num = re.compile(pattern)
    return [m.group() for m in re_num.finditer(text)]
print find(text, ['467033', '13'])
Prints:
['135.13508', '1.4670337777777773']
If you're just searching for a substring within another substring, you can use in:
>>> sub_num = "467033"
>>> my_num = "1.467033777777777"
>>> sub_num in my_num
True
However, I suspect there's more to your problem than just searching strings, and that doing it this way might not be optimal. Can you be more specific about what you're trying to do?
import re
a = 'e.g. 1.467033777777777\nand also 576575567467033546.90 Thanks '
r = re.compile('[0-9.]*467033[0-9.]*')
r.findall(a)
['1.467033777777777', '576575567467033546.90']

Categories