Sentence count that start with a specific word - python

I am trying to print a count of the sentences that start with a specific word (When, I, or From, as specified below), but I am not getting the right sentence count.
Here is what I've got so far:
import re
import math
def matching_sentences(file_name, word):
'''
Test:
>>> matching_sentences('bike.txt', 'When')
3 sentences begin with When
>>> matching_sentences('bike.txt', 'I')
1 sentences begin with I
>>> matching_sentences('bike.txt', 'From')
0 sentences begin with From
>>>
'''
count = 0
with open('bike.txt', 'r') as f:
for line in f:
if word in line.split():
count =+ 1
print (count," sentences begin with ", word)
# These lines are part of a larger file.
def show_file(file_name):
with open(file_name, 'r') as result_file:
print(result_file.read())
if __name__ == "__main__":
import doctest
doctest.testmod(verbose = True)
Sample text file (bike.txt):
When I was little I had a bike. I went everywhere on that bike! When I
got too big for it, I gave it to my little brother. When he out grew
it, he gave it to our sister. What did she do with it? She lost it!

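You can read the whole content in one go, use re.split(...) to split it into sentences, and use .strip() to clean up any whitespace characters at the ends. Then use str.startswith(...) to check whether the sentence starts with the word. Note that a bare startswith also matches longer words that share the prefix (for example, 'I' matches a sentence beginning with 'It'), so compare the sentence's first word if you need an exact match.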
Here is a code to get you started:
import math
import re
def matching_sentences(file_name, word):
    '''
    Print how many sentences in a text file begin with ``word``.

    The text is split into sentences on '.', '?' and '!'. A sentence is
    counted when its first whitespace-separated token, with surrounding
    quotes, brackets, commas, semicolons and colons removed, equals
    ``word``. The comparison ignores case and is made against the whole
    first word, so 'I' does not match a sentence that starts with 'It'.

    file_name -- path of the text file to read.
    word      -- the word a sentence has to start with.

    Nothing is returned; the result is printed so the examples below pass.

    Test:
    >>> matching_sentences('bike.txt', 'When')
    3 sentences begin with When
    >>> matching_sentences('bike.txt', 'I')
    1 sentences begin with I
    >>> matching_sentences('bike.txt', 'From')
    0 sentences begin with From
    '''
    target = word.lower()
    with open(file_name, 'r') as f:
        content = f.read()
    count = 0
    # Raw string: '\.' in a normal string literal is an invalid escape sequence.
    for sentence in re.split(r'[.?!]', content):
        tokens = sentence.split()
        if not tokens:
            continue  # blank fragment, e.g. whatever follows the final '!'
        # Compare the whole first word; a prefix test would let 'I' match 'It'.
        first_word = tokens[0].strip('"\'()[],;:')
        if first_word.lower() == target:
            count += 1
    print(count, "sentences begin with", word)
# These lines are part of a larger file.
def show_file(file_name):
with open(file_name, 'r') as result_file:
print(result_file.read())
if __name__ == "__main__":
import doctest
doctest.testmod(verbose=True)

def matching_sentences(file_name: str, word: str) -> int:
    """Return the number of sentences in a file whose first word is ``word``.

    The file is read in full and split into sentences on '.', '?' and '!',
    so a sentence that wraps across several lines is still counted once and
    a sentence that starts in the middle of a line is not missed. The
    comparison with ``word`` is exact and case-sensitive.

    Args:
        file_name: Path of the text file to read.
        word: The word a sentence has to start with.

    Returns:
        The number of matching sentences.
    """
    # Imported here so the function is self-contained: the snippet uses
    # Path without importing it anywhere.
    import re
    from pathlib import Path

    text = Path(file_name).read_text()
    return sum(
        1
        for sentence in re.split(r"[.?!]", text)
        # The [:1] slice yields [] for blank fragments, so they never match.
        if sentence.split()[:1] == [word]
    )
print(matching_sentences("bike.txt", "When"))
## output: 3

Related

Can't figure out how to run this Python script

Can someone explain to me (I'm not experienced in programming) how to use this script correctly (link: https://github.com/dumbmatter/find-repeated-words)? Basically, it should take a text file as input and output an HTML file in which words that are used repeatedly close together are highlighted. But when I run it (I installed Pyzo) I get the message: "SyntaxError: invalid syntax". I have no idea what Python is talking about; I can only assume the problem concerns the input file.
CODE:
#!/usr/bin/env python
import sys from string import punctuation from operator import itemgetter
# Check command line inputs if len(sys.argv) == 1:
print 'Pass the input text file as the first argument.'
sys.exit() elif len(sys.argv) == 2:
infile = sys.argv[1]
outfile = '%s.html' % (infile.split('.')[0],) else:
infile = sys.argv[1]
outfile = sys.argv[2]
print infile, outfile
N = 10 words = {} # Dict of word frequencies pos = {} # Dict of word positions scores = [] # List of word repeatedness scores articles = ['the', 'a', 'of', 'and', 'in', 'et', 'al'] # Common articles to ignore
# Build lists
words_gen = (word.strip(punctuation).lower() for line in open(infile)
for word in line.split())
i = 0 for word in words_gen:
words[word] = words.get(word, 0) + 1
# Build a list of word positions
if words[word] == 1:
pos[word] = [i]
else:
pos[word].append(i)
i += 1
# Calculate scores
words_gen = (word.strip(punctuation).lower() for line in open(infile)
for word in line.split())
i = 0 for word in words_gen:
scores.append(0)
# scores[i] = -1 + sum([pow(2, -abs(d-i)) for d in pos[word]]) # The -1 accounts for the 2^0 for self words
if word not in articles and len(word) > 2:
for d in pos[word]:
if d != i and abs(d-i) < 50:
scores[i] += 1.0/abs(d-i)
i += 1
scores = [score*1.0/max(scores) for score in scores] # Scale from 0 to 1
# Write colored output
f = open(outfile, 'w'); i = 0 for line in open(infile):
for word in line.split():
f.write('<span style="background: rgb(%i, 255, 255)">%s</span> ' % ((1-scores[i])*255, word))
i += 1
f.write('<br /><br />') f.close()
print 'Output saved to %s' % (outfile,)
Python is very sensitive to the formatting of the code: you cannot break or indent lines in places where Python does not expect it. Just looking at the first lines:
import sys from string import punctuation from operator import itemgetter
should be split into 3 lines:
import sys
from string import punctuation
from operator import itemgetter
There are more errors like this in the code you pasted. I have downloaded the original code from the link, and it works fine.

How to take out punctuation from string and find a count of words of a certain length?

I am trying to create a function that opens a .txt file and counts the words that have the same length as the number specified by the user.
The .txt file is:
This is a random text document. How many words have a length of one?
How many words have the length three? We have the power to figure it out!
Is a function capable of doing this?
I'm able to open and read the file, but I am unable to exclude punctuation and find the length of each word.
def samplePractice(number):
fin = open('sample.txt', 'r')
lstLines = fin.readlines()
fin.close
count = 0
for words in lstLines:
words = words.split()
for i in words:
if len(i) == number:
count += 1
return count
You can try using the replace() on the string and pass in the desired punctuation and replace it with an empty string("").
It would look something like this:
puncstr = "Hello!"
nopuncstr = puncstr.replace(".", "").replace("?", "").replace("!", "")
I have written a sample code to remove punctuations and to count the number of words. Modify according to your requirement.
import re
fin = """This is a random text document. How many words have a length of one? How many words have the length three? We have the power to figure it out! Is a function capable of doing this?"""
fin = re.sub(r'[^\w\s]','',fin)
print(len(fin.split()))
The above code prints the number of words. Hope this helps!!
Instead of chaining replace() calls, use a single strip() call, which removes the listed punctuation characters from both ends of each word.
Edit: a cleaner version
pl = '?!."\''  # punctuation list


def samplePractice(number, file_name='sample.txt'):
    """Print and count the words in a text file that have ``number`` characters.

    Each whitespace-separated word has the punctuation listed in ``pl``
    stripped from both ends before its length is measured.

    number    -- the word length to look for.
    file_name -- path of the file to read; defaults to 'sample.txt' so that
                 existing calls such as samplePractice(4) keep working.

    Every matching word is printed followed by ', ' (all on one line).
    Returns the number of matching words.
    """
    with open(file_name, 'r') as fin:
        # clean words
        words = [w.strip(pl) for w in fin.read().split()]
    matches = [word for word in words if len(word) == number]
    for word in matches:
        print(word, end=', ')
    return len(matches)
result = samplePractice(4)
print('\nResult:', result)
output:
This, text, many, have, many, have, have, this,
Result: 8
Your code is almost OK; it is just that the second for block is in the wrong position.
pl = '?!."\''  # punctuation list


def samplePractice(number, file_name='sample.txt'):
    """Count the words in a text file that have exactly ``number`` characters.

    Each whitespace-separated word has the punctuation listed in ``pl``
    stripped from both ends before its length is measured.

    number    -- the word length to look for.
    file_name -- path of the file to read; defaults to 'sample.txt' so that
                 existing calls such as samplePractice(4) keep working.

    Returns the number of matching words.
    """
    count = 0
    # The context manager closes the file. A bare `fin.close` without
    # parentheses only names the method and never calls it.
    with open(file_name, 'r') as fin:
        for line in fin:
            for token in line.split():
                # clean the word by strip before measuring it
                if len(token.strip(pl)) == number:
                    count += 1
    return count
result = samplePractice(4)
print(result)
output:
8

word counter || python

I want to print the number of words in a txt file that have 1-20 letters.
I tried this, but it prints 20 zeroes instead. Any idea?
Edit: in the end the program should plot 20 numbers, where the i-th number is the count of words in the file that have i letters (for i from 1 to 20).
fin = open('words.txt')
for i in range(20):
counter = 0
for line in fin:
word = line.strip()
if len(word) == i:
counter = counter + 1
print counter,
EDIT
To produce individual counts for each word length you can use a collections.Counter:
from collections import Counter
def word_lengths(f):
for line in f:
for word in line.split(): # does not ignore punctuation
yield len(word)
with open('words.txt') as fin:
counts = Counter(length for length in word_lengths(fin) if length <= 20)
This uses a generator to read the file and produce a sequence of word lengths. The filtered word lengths are fed into a Counter. You could perform the length filtering on the Counter instead.
If you want to ignore punctuation you could look at using str.translate() to remove unwanted characters, or possibly re.split(r'\W+', line) instead of line.split().
Try it like this:
with open('words.txt') as fin:
counter = 0
for line in fin:
for word in line.split():
if len(word) <= 20:
counter = counter + 1
print counter,
This could be simplified to:
with open('words.txt') as fin:
counter = sum([1 for line in fin
for word in line.split() if len(word) <= 20])
but that's playing code golf.
You can also use a collections.Counter if it is practical to read the entire file into memory:
from collections import Counter
with open('words.txt') as fin:
c = Counter(fin.read().split())
counter = sum(c[k] for k in c if len(k) <= 20)
And no doubt there are many other ways to do it. None of the above expect or handle punctuation.
It should be like this: the counter shouldn't be reset inside the for loop, and you can use len() to get the length of each word:
with open("test") as f:
counter = 0
for line in f:
for word in line.split():
if len(word)<=20:
counter+=1
print counter
Or my way:
import re
with open("file") as f:
print len(filter(lambda x:len(x)<20,re.split('\n| ', f.read())))
Hope this helps.
using regular expressions
import re

# A run of 1 to 20 non-space characters delimited by word boundaries.
REGEX = r"(\b\S{1,20}\b)"
finder = re.compile(REGEX)


def count_word_lengths(path="words.txt"):
    """Return a 20-item list of word counts by length for the file at ``path``.

    Item ``i`` holds the number of matches of ``REGEX`` that are ``i + 1``
    characters long, so the list covers lengths 1 to 20. Indexing by
    ``len(m)`` directly would leave slot 0 unused and raise IndexError for
    a 20-character word.
    """
    with open(path) as out:
        data = out.read()
    lst = [0] * 20
    for m in finder.findall(data):
        lst[len(m) - 1] += 1
    return lst


if __name__ == "__main__":
    print(count_word_lengths())

MapReduce to count the frequency of the number consonants in words from a text file

I need a bit of help with Python code to count the frequency of consonants in a word. Consider the following sample input:
"There is no new thing under the sun."
Then the required output would be:
1 : 2
2 : 3
3 : 2
4 : 1
as there are 2 words with 1 consonant, 3 words with 2 consonants, 2 words with 3 consonants and 1 word with 4 consonants.
The following code does a similar job, but instead of counting consonants it counts whole words in a text file. I think only a small change is needed, one that loops deeper into each word.
def freqCounter(file1, file2):
freq_dict = {}
dict_static = {2:0, 3:0, 5:0}
# get rid of punctuation
punctuation = re.compile(r'[.?!,"\':;]') # use re.compile() function to convert string into a RegexObject.
try:
with open(file1, "r") as infile, open(file2, "r") as infile2: # open two files at once
text1 = infile.read() # read the file
text2 = infile2.read()
joined = " ".join((text1, text2))
for word in joined.lower().split():
#remove punctuation mark
word = punctuation.sub("", word)
#print word
l = len(word) # assign l tp be the word's length
# if corresponding word's length not found in dict
if l not in freq_dict:
freq_dict[l] = 0 # assign the dict key (the length of word) to value = 0
freq_dict[l] += 1 # otherwise, increase the value by 1
except IOError as e: # exception catch for error while reading the file
print 'Operation failed: %s' % e.strerror
return freq_dict # return the dictionary
Any help will be much appreciated!
I would try a simpler approach:
from collections import Counter

words = 'There is no new thing under the sun.'
CONSONANTS = 'bcdfghjklmnpqrstvwxyz'
# Count the consonant letters of each word directly. Deleting the lowercase
# vowels and measuring what is left would also count punctuation
# ('sun.' -> 'sn.') and capital vowels as consonants.
word_lengths = [
    sum(letter in CONSONANTS for letter in word.lower())
    for word in words.split()
]
# Maps "number of consonants in a word" -> "how many words have that number".
c = Counter(word_lengths)
freq_dict = dict(c)
A simple solution
def freqCounter(_str):
    """Count words by the number of consonants they contain.

    _str -- the text to analyse; words are separated by whitespace.

    Returns a dict that maps a consonant count to the number of words with
    that many consonants. Words without any consonant are recorded under
    key 0. Only the letters b-z other than the vowels are consonants, in
    either case, so capital vowels, digits and punctuation are never
    counted.
    """
    consonants = "bcdfghjklmnpqrstvwxyz"
    freq_dict = {}
    for word in _str.split():
        # Lower-case first so that 'T' counts and 'A', 'E', ... do not.
        c = sum(1 for letter in word.lower() if letter in consonants)
        freq_dict[c] = freq_dict.get(c, 0) + 1
    return freq_dict
txt = "There is no new thing under the sun."
table=freqCounter(txt)
for k in table:
print( k, ":", table[k])
How about this?
with open('conts.txt', 'w') as fh:
fh.write('oh my god becky look at her butt it is soooo big')
consonants = "bcdfghjklmnpqrstvwxyz"


def count_cons(_file):
    """Count the words of a text file by the number of consonants they contain.

    _file -- path of the text file to read.

    Returns a dict that maps a consonant count to the number of words with
    that many consonants. Letters are compared case-insensitively, so a
    capitalised word is counted the same as its lowercase form.
    """
    results = {}
    with open(_file, 'r') as fh:
        for line in fh:
            # split() without an argument collapses repeated whitespace, so
            # doubled spaces do not produce empty "words" with 0 consonants.
            for word in line.split():
                conts = sum(1 for letter in word.lower() if letter in consonants)
                results[conts] = results.get(conts, 0) + 1
    return results
print count_cons('conts.txt')
Missed the results
{1: 5, 2: 5, 3: 1, 4: 1}
[Finished in 0.0s]

Need to open text file, print random word with over 5 characters python

import random
dictionary = open('word_list.txt', 'r')
for line in dictionary:
for i in range(0, len(line)):
if i >= 5:
word = random.choice(line)
dictionary.close()
This code doesn't seem to work for me.
here is a link to the file if it helps
http://vlm1.uta.edu/~athitsos/courses/cse1310_summer2013/assignments/assignment8/word_list.txt
import random


def random_long_word(path='word_list.txt', min_length=5):
    """Return a random word from ``path`` longer than ``min_length`` characters.

    The file is expected to hold one word per line. Trailing whitespace,
    including the newline, is removed before the length is checked.
    Measuring the raw line would count the newline and let words of exactly
    ``min_length`` characters through.

    Raises IndexError (from random.choice) when no word is long enough.
    """
    with open(path, 'r') as f:
        words = [line.rstrip() for line in f]
    candidates = [word for word in words if len(word) > min_length]
    return random.choice(candidates)


if __name__ == "__main__":
    print(random_long_word())
As @ashwini-chaudhary correctly pointed out, the word at each step of the iteration has a newline \n at the end - that's why you need to use rstrip().
Assuming each word is on its own line, such as:
word
word2
word3
...
then you can do this:
from random import choice


def pick_long_word(path="word_list.txt", min_length=5):
    """Return a random word from ``path`` longer than ``min_length`` characters.

    Assumes one word per line. Each line is stripped of trailing whitespace
    before its length is tested, so the newline is not counted as a
    character of the word.

    Raises IndexError (from random.choice) when no word is long enough.
    """
    with open(path) as file:
        stripped = (line.rstrip() for line in file)
        return choice([word for word in stripped if len(word) > min_length])


if __name__ == "__main__":
    print(pick_long_word())

Categories