Average word length - punctuation (",", ".", ";") and newline characters {"\n"} - python

I want to write a Python script that reads text from the console and outputs the average number of characters per word, but I have a problem with punctuation and newline characters.
there is my code.
def main():
allwords = []
while True:
words = input()
if words == "Amen.":
break
allwords.extend(words.split())
txt = " ".join( allwords)
n_pismen = len([c for c in txt if c.isalpha()])
n_slov = len([i for i in range(len(txt) - 1) if txt[i].isalpha() and not txt[i + 1].isalpha()])
for char in (',', '.', ';'):
txt = txt.replace(char, '')
txt.replace('\n', ' ')
words = txt.split()
print(sum(len(word) for word in words) / len(words))
if words:
average = sum(len(words) for words in allwords) / len( allwords)
if __name__ == '__main__':
main()
Our Father, which art in heaven,
hallowed be thy name;
thy kingdom come;
thy will be done,
in earth as it is in heaven.
Give us this day our daily bread.
And forgive us our trespasses,
as we forgive them that trespass against us.
And lead us not into temptation,
but deliver us from evil.
For thine is the kingdom,
the power, and the glory,
For ever and ever.
Amen.
The expected output is 4.00, but I just get 1.00.

Not sure what's wrong in your example, but this will work. I used "test" as the string name, you can modify that as desired:
# Average number of letters per word in `test`, ignoring punctuation.
counts = []  # Letter count of each word
for t in test.split():  # Split into tokens at any whitespace (spaces, newlines)
    n_letters = sum(c.isalpha() for c in t)  # Count letters only, so ",.;" are ignored
    if n_letters:  # A token with no letters (e.g. a lone "-") is not a word
        counts.append(n_letters)
# Guard against empty input, where there is nothing to average
print(sum(counts) / len(counts) if counts else 0.0)

You can do this as follows (where strng is the passage of text):
# Remove all of the 'bad' characters
for char in (',', '.', ';'):
    strng = strng.replace(char, '')
# str.replace returns a new string, so the result has to be assigned back
strng = strng.replace('\n', ' ')
# Split on whitespace
words = strng.split()
# Calculate the average length (0.0 when the text contains no words)
print(sum(len(word) for word in words) / len(words) if words else 0.0)

I would match every word using a regex, then keep track of the number of words and the total number of characters:
import re
# Replace with the path of the text file to analyse.
PATH_TO_YOUR_FILE = "input.txt"
total_number = 0  # running total of letters over all words
n_words = 0  # running count of words
# A word is a maximal run of ASCII letters; punctuation and digits separate words
pattern = re.compile("[a-z]+", re.IGNORECASE)
with open(PATH_TO_YOUR_FILE, "r") as f:
    # Read line by line so the whole file never has to be held in memory
    for line in f:
        words = pattern.findall(line)
        n_words += len(words)
        total_number += sum(len(x) for x in words)
# Avoid dividing by zero when the file contains no words
print(total_number / n_words if n_words else 0.0)
OUTPUT
4.0

Try this:
import string
# Read one line of text and print the average word length without punctuation.
s = input()
# Delete every ASCII punctuation character in a single pass
s = s.translate(str.maketrans('', '', string.punctuation))
# str.replace returns a new string, so the result has to be assigned back
s = s.replace('\n', ' ')
words = s.split()
# Print 0.0 rather than dividing by zero when no words were entered
print(sum(map(len, words)) / len(words) if words else 0.0)

Related

Getting the maximum number of words in a sentence of a paragraph Python

I'm trying to get the maximum numbers of words inside a sentence of a paragraph but just can't see it.
Here is what I tried:
S = input("Enter a paragraph")
def getMaxNum(S):
if "." in S:
new_list = S.split(".")[0]
return len(new_list)
else "?" in S:
new_list = S.split("?")[0]
return len(new_list)
else "!" in S:
new_list = S.split("?")[0]
return len(new_list)
getMaxNum(S)
In the else statement I could be getting the previous sentence values but that's not what I need to get. Any ideas how can I accomplish that?
I'm not 100% certain of what your requirements are, but if I borrow Buoy Rina's input, here's a solution using regular expressions (pattern search strings):
#!/usr/bin/env python3
import re
text = "I will go school tomorrow. I eat apples. Here is a six word sentence."
# Break the text into sentences at end-of-sentence punctuation.
sentences = re.split(r"[.!?]", text)
# A sentence's word count is its number of whitespace-separated tokens;
# keep the largest one (0 if there are no sentences at all).
max_words = max((len(sentence.split()) for sentence in sentences), default=0)
print(f"max_words: {max_words}")
The re.split() breaks the text (or paragraph) into sentences based on "some" end of sentence punctuation. There are likely conditions under which searching for period '.' won't yield a complete sentence, but we'll ignore that for simplicity.
The string function split() then breaks up the sentence into words based on white space (the default of split()). We then get the length of the resultant list to find the word count.
text = "I will go school tomorrow. I eat apples. I will have a very long sentence. "
def getmaxwordcount(text):
    """Return the largest number of words found in any sentence of *text*.

    Sentences end at '.', '!' or '?'. Text after the last terminator is
    treated as one more sentence. A word is a whitespace-delimited token
    that contains at least one letter.

    Returns 0 for empty text.
    """
    count_word = 0          # words seen so far in the current sentence
    is_start_word = False   # True while inside a word that has not been counted yet
    counts = []             # word count of every finished sentence
    for c in text:
        if c.isspace():
            # Whitespace ends the current word, if any
            if is_start_word:
                count_word += 1
                is_start_word = False
        elif c in '.!?':
            # A terminator ends both the current word and the sentence
            if is_start_word:
                count_word += 1
                is_start_word = False
            counts.append(count_word)
            count_word = 0
        elif c.isalpha():
            is_start_word = True
    # Flush the trailing sentence that has no terminator; this also
    # guarantees `counts` is never empty, so max() cannot fail.
    if is_start_word:
        count_word += 1
    counts.append(count_word)
    return max(counts)
getmaxwordcount(text) # 7
import re
text = "I will go school tomorrow. I eat apples."
def foo(txt):
    """Return the highest word count among the sentences of *txt*.

    Sentences are separated by '!', '.' or '?'; words are
    whitespace-separated tokens. Returns 0 when *txt* has no words.
    """
    # re.split always yields at least one piece, so max() never sees an empty input
    return max(len(sentence.split()) for sentence in re.split(r'[!.?]', txt))
print(foo(text)) # returns 5
code
import re
paragraph = "Two words. Three other words? Finally four another words!"
# Split into sentences at '.', '!' or '?' and drop the empty pieces
sentences = [s for s in re.split(r'[.!?]', paragraph) if s]
# Word count of each sentence, computed once and reused below
sentence_lengths = [len(s.split()) for s in sentences]
all_lengths_in_paragraph = [
    f"Length of {n+1}th sentence is {length}"
    for n, length in enumerate(sentence_lengths)
]
# default=0 keeps an empty paragraph from raising ValueError
max_length = max(sentence_lengths, default=0)
for one_length in all_lengths_in_paragraph:
    print(one_length)
print('maximum length is', max_length)
output
Length of 1th sentence is 2
Length of 2th sentence is 3
Length of 3th sentence is 4
maximum length is 4

Shuffle words' characters while maintaining sentence structure and punctuations

So, I want to be able to scramble words in a sentence, but:
Word order in the sentence(s) is left the same.
If the word started with a capital letter, the jumbled word must also start with a capital letter
(i.e., the first letter gets capitalised).
Punctuation marks . , ; ! and ? need to be preserved.
For instance, for the sentence "Tom and I watched Star Wars in the cinema, it was
fun!" a jumbled version would be "Mto nad I wachtde Tars Rswa ni het amecin, ti wsa
fnu!".
from random import shuffle
def shuffle_word(word):
word = list(word)
if word.title():
???? #then keep first capital letter in same position in word?
elif char == '!' or '.' or ',' or '?':
???? #then keep their position?
else:
shuffle(word)
return''.join(word)
L = input('try enter a sentence:').split()
print([shuffle_word(word) for word in L])
I am ok for understanding how to jumble each word in the sentence but... struggling with the if statement to apply specifics? please help!
Here is my code. Little different from your logic. Feel free to optimize the code.
import random
def shuffle_word(words):
    """Shuffle the letters of every space-separated word in *words*.

    Word order is kept. Trailing punctuation marks (. , ; ! ?) stay at the
    end of their word, and a title-case word ("Star") stays title-case
    after shuffling ("Rast").

    Returns the shuffled sentence as a single string.
    """
    shuffled = []
    for word in words.split(" "):
        # Separate the trailing punctuation so it is never shuffled into the word
        core = word.rstrip(".,;!?")
        tail = word[len(core):]
        mixed = ''.join(random.sample(core, len(core)))
        if core.istitle():
            # Move the capital back to the front and lower-case the rest
            mixed = mixed.capitalize()
        shuffled.append(mixed + tail)
    return ' '.join(shuffled)
L = "Tom and I watched Star Wars in the cinema, it was fun!"
print(shuffle_word(L))
Output of above code execution:
Mto nda I whaecdt Atsr Swra in hte ienamc, ti wsa nfu!
Hope it helps. Cheers!
Glad to see you've figured out most of the logic.
To maintain the capitalization of the first letter, you can check it beforehand and capitalize the "new" first letter later.
# `word` is expected to be a list of characters (e.g. from list(word)).
# Remember whether the word began with a capital (False for an empty word)
first_letter_is_cap = bool(word) and word[0].isupper()
if first_letter_is_cap:
    # Lower-case the original capital so it does not end up mid-word
    word[0] = word[0].lower()
shuffle(word)
if first_letter_is_cap:
    # Re-capitalize first letter
    word[0] = word[0].upper()
To maintain the position of a trailing punctuation, strip it first and add it back afterwards:
# `word` is expected to be a list of characters (e.g. from list(word)).
# None marks an empty word, which has no last character to inspect
last_char = word[-1] if word else None
has_trailing_punct = last_char is not None and last_char in ".,;!?"
if has_trailing_punct:
    # Strip the punctuation
    word = word[:-1]
shuffle(word)
if has_trailing_punct:
    # Add it back
    word.append(last_char)
Since this is a string processing algorithm I would consider using regular expressions. Regex gives you more flexibility, cleaner code and you can get rid of the conditions for edge cases. For example this code handles apostrophes, numbers, quote marks and special phrases like date and time, without any additional code and you can control these just by changing the pattern of regular expression.
from random import shuffle
import re
# Characters considered part of words
pattern = r"[A-Za-z']+"


def shuffle_word(word):
    """Return the characters of *word* in random order, lower-cased."""
    w = list(word)
    shuffle(w)
    return ''.join(w).lower()


def replace_func(match):
    """re.sub callback: return the shuffled version of the matched word."""
    return shuffle_word(match.group())


def shuffle_str(str):
    """Shuffle the letters inside each word of *str*.

    Every character not matched by `pattern` keeps its position, and every
    position that held an upper-case letter in the input holds an
    upper-case character in the result.

    NOTE: the parameter name shadows the built-in ``str``; it is kept so
    that existing callers keep working.
    """
    # Replace words with their shuffled, lower-cased version. The result has
    # the same length as the input, so indices line up between the two.
    shuffled_str = re.sub(pattern, replace_func, str)
    char_list = list(shuffled_str)
    # Re-apply upper case at every index where the original had a capital
    for match in re.finditer(r"[A-Z]", str):
        uppercase_index = match.start()
        char_list[uppercase_index] = char_list[uppercase_index].upper()
    return ''.join(char_list)
print(shuffle_str('''Tom and I watched "Star Wars" in the cinema's new 3D theater yesterday at 8:00pm, it was fun!'''))
This works with any sentence, even if it has several "special" characters in a row, preserving all the punctuation marks:
from random import sample
def shuffle_word(sentence):
    """Shuffle the letters of each word in *sentence*, leaving the rest alone.

    A word is a maximal run of alphabetic characters. Every other character
    (spaces, punctuation, digits) stays at its original position.
    One-letter words are left unchanged, and a title-case word stays
    title-case after shuffling.

    Returns a string of the same length as *sentence*.
    """
    pieces = []
    word = ""
    # The trailing sentinel space flushes the last word; it is removed
    # again before returning so the result has no extra character.
    for char in sentence + ' ':
        if char.isalpha():
            word += char
            continue
        if word:
            if len(word) == 1:
                pieces.append(word)
            else:
                new_word = ''.join(sample(word, len(word)))
                pieces.append(new_word.title() if word == word.title() else new_word)
            word = ""
        pieces.append(char)
    # The last piece is always the sentinel space
    return ''.join(pieces[:-1])
text="Tom and I watched Star Wars in the cinema, it was... fun!"
print(shuffle_word(text))
Output:
Mto nda I hctawed Rast Aswr in the animec, ti asw... fnu!

Python code to find the total number of shortest words in a string.

I am working on an assignment and I am trying to make a code for the below mentioned problem.
Write a Python script that reads a typed text, analyzes how many words the text consists of, and prints the total number of words as well as the number of "short" words of just three letters or fewer.
The given string is: "The play 's the thing wherein I'll catch the conscience of the king."
There is a small trick in the question. One cannot use the split() function because it will consider "I'll" as one word but the assignment requires us to consider it as two different words and hence giving an output that shows that the string has 14 words.
When it comes to the "short words". It should again consider "I'll" as two separate short words and it should give an output that shows that the string has 8 short words i.e. ["The", "s", "the", "I", "ll", "the", "of", "the"].
Thanks a lot and I would love it if you could share a code for this problem.
string= input("Enter string:")
word=1
y = 0
char = 0
for i in string:
if(i == ' ' or i == "'"):
word = word+1
for x in i:
if len(x) <= 3:
y = y+1
print("Number of words in the string:")
print(word)
print (y)
This is my code and the output is below:
Number of words in the string:
16
69
You can use re.split() to split on multiple delimiters:
import re
s = "The play 's the thing wherein I'll catch the conscience of the king"
# Cut the text at every apostrophe or space
lst = re.split(r"'| ", s)
# Adjacent delimiters produce empty strings; discard them
all_words_lst = list(filter(None, lst))
print(f'Total words count: {len(all_words_lst)}')
# "Short" words have three characters or fewer
short_words_lst = [w for w in all_words_lst if len(w) <= 3]
print(f'Total short words count: {len(short_words_lst)}')
# Total words count: 14
# Total short words count: 8
You can first replace "'" with " " and then later call split on resultant string.
>>> s = "The play's the thing wherein I'll catch the conscience of the king."
>>> s = s.replace("'", " ")
>>> s = s.split()
>>> len(s)
14
>>> s
['The', 'play', 's', 'the', 'thing', 'wherein', 'I', 'll', 'catch', 'the', 'conscience', 'of', 'the', 'king.']
x = "The play 's the thing wherein I'll catch the conscience of the king."
# Turn apostrophes into spaces so contractions fall apart into separate
# words, then split on whitespace
x = x.replace("'", " ").split()
# Keep the words that are three characters long or shorter
ans = list(filter(lambda w: len(w) <= 3, x))
print("Total number of words {}, short words{}".format(len(x), len(ans)))
You can change all characters ' to space. Then split() without any arguments returns the list of all words.
# Count all words and the "short" words (three characters or fewer) of a
# typed text, treating an apostrophe as a word separator.
string = input("Enter string:")
# Strings are immutable, so item assignment fails; build a new string
# with every apostrophe replaced by a space instead.
string = string.replace("'", " ")
words = string.split()
word = len(words)  # total number of words
y = sum(1 for w in words if len(w) <= 3)  # number of short words
print("Number of words in the string:")
print(word)
print(y)
With the re.findall function:
import re
input_string = input("Enter string:")  # for ex. "He is a good-hearted person, too"
# A word is a run of word characters, optionally followed by one hyphenated part
words = re.findall(r"\w+(?:-\w+)?", input_string)
# Words of three characters or fewer count as "short"
short_words = [w for w in words if len(w) <= 3]
print("Total number of words:", len(words))
print("Number of 'short' words:", len(short_words))
The output:
Total number of words: 6
Number of 'short' words: 4

Regex in python search

I'm trying to make something in python where you put the amount of letters in a word then it searches in a word list for words with that amount of chars.
My code:
import sys
import re
def search(pattern):
print("Searching...\n\n")
for i, line in enumerate(open(sys.argv[1])):
for match in re.finditer(pattern, line):
print(match.groups())
print("\n\nFinished.")
while True:
word = ""
put = int(input("Amount of Letters in Word: "))
if put > 25 or put < 3:
print("Invalid amount of letters.")
else:
for n in range(0, put):
word = word + "."
word = "^" + word + "$"
pattern = re.compile(word)
search(pattern)
I want it to show all words with the amount of letters that you put.
https://i.imgur.com/Kgusvyh.png
List of words:
word
1234
okay
0000
asdfg
asdfh
asdgj
Why does it show ()?
Fixed by replacing match.groups() with match.group().

Python -Remove capitalized words from long string

I have a long string (28MB) of normal sentences. I want to remove all words that are fully in capital letters (like TNT, USA, OMG).
So from the sentence:
Jump over TNT in There.
I would like to get:
Jump over in There.
Is there any way to do it without splitting the text into a list and iterating? Is it possible to use a regex somehow to do this?
You can use the set of capital letters [A-Z] captured with word boundary \b:
import re
line = 'Jump over TNT in There NOW'
# \b...\b matches whole all-caps words only; the optional trailing space is
# consumed too so that removing a word does not leave a double space behind
m = re.sub(r'\b[A-Z]+\b ?', '', line)
#'Jump over in There '
Use the module re,
import re
line = 'Jump over TNT in There.'
# Word boundaries restrict the match to whole all-caps words, so mixed-case
# words are never partially eaten; the optional space avoids a double space
new_line = re.sub(r'\b[A-Z]+\b ?', '', line)
print(new_line)
# Output
Jump over in There.
I would do something like this:
import string
def onlyUpper(word):
    """Return True if every character of *word* is an upper-case letter.

    Digits and punctuation make the result False. An empty string yields
    True, because no character fails the check.
    """
    return all(c.isupper() for c in word)
s = "Jump over TNT in There."
# Replace punctuation with spaces so it does not stick to the words
for char in string.punctuation:
    s = s.replace(char, ' ')
words = s.split()
# Keep every word that is not written entirely in capitals
good_words = [w for w in words if not onlyUpper(w)]
# join builds the string in one pass and leaves no trailing space
result = " ".join(good_words)
print(result)

Categories