How to read words with ' in them? - python

I got this code that prints the most common words of a txt file. I want it to print and count words with ' in them. How can I do this?
# Print the top-N most frequent words of a text file, one line per count,
# with each count's words listed alphabetically.
from collections import Counter

# BUG FIX: the original never closed the file; use a context manager.
with open(input('Enter the name of the file: ')) as infile:
    words = infile.read().lower().split()
number_of_words = int(input('Enter how many top words you want to see: '))
stop_words = ["a", "an", "and", "in", "is", "the"]
# Single O(n) Counter pass replaces the original O(n * unique) nested
# count loops; only purely alphanumeric, non-stop words are counted.
counts = Counter(w for w in words if w.isalnum() and w not in stop_words)
# (count, word) pairs in descending order, then grouped by count --
# reproduces the original sort()/reverse() ordering.
pairs = sorted(((c, w) for w, c in counts.items()), reverse=True)
counts_dict = {}
for count, word in pairs:
    counts_dict.setdefault(count, []).append(word)
printed = 0
for count in counts_dict:
    if printed >= number_of_words:
        break
    print('The following words appeared %d times each: %s'
          % (count, ', '.join(sorted(counts_dict[count]))))
    printed += 1

Write your own function that checks whether every character in a string is alphanumeric or quote, and use that instead of word.isalnum().
def alnum_or_quote(s):
    """Return True if every character of *s* is alphanumeric or an apostrophe."""
    for ch in s:
        if ch != "'" and not ch.isalnum():
            return False
    return True
Then replace if word.isalnum(): with if alnum_or_quote(word):

Related

How to sort my output form my program without changing the order of my whole loop?

I have this code:
# Print the top-N most frequent words of a text file, one line per count.
from collections import Counter

# BUG FIX: the original never closed the file; use a context manager.
with open(input('Enter the name of the file: ')) as infile:
    words = infile.read().lower().split()
number_of_words = int(input('Enter how many top words you want to see: '))
stop_words = ["a", "an", "and", "in", "is"]
# Single O(n) Counter pass replaces the original O(n * unique) nested loops.
counts = Counter(w for w in words if w.isalnum() and w not in stop_words)
# (count, word) pairs in descending order -- same ordering as the original
# sort()/reverse(); the unused sorted_count variable from the paste is gone.
pairs = sorted(((c, w) for w, c in counts.items()), reverse=True)
counts_dict = {}
for count, word in pairs:
    counts_dict.setdefault(count, []).append(word)
printed = 0
for count in counts_dict:
    if printed >= number_of_words:
        break
    # NOTE: each group is deliberately printed unsorted -- that is exactly
    # what this question asks about (the answer wraps it in sorted()).
    print('The following words appeared %d times each: %s'
          % (count, ', '.join(counts_dict[count])))
    printed += 1
It prints the most common words in a txt file. It outputs this:
The following words appeared 8 times each: what, just, apple
I want the output after "each:" to be sorted in an alphabetical order without changing the order lines print. How can I do this? Thanks!
Something like this:
The following words appeared 8 times each: apple, just, what
Use sorted:
print('The following words appeared %d times each: %s' % (count, ', '.join(sorted(counts_dict[count]))))

How to find the longest word in a string without using split method

I need an algorithm that can find the longest word in a string, I can't use split(), the only predefined function I can use is find() and I don't think it's useful for this solution.
This is what I managed to do so far:
# Driver: read the sentence to analyse from stdin.
ch=input("donner: ")
def plus_long(ch):
    """Return the longest space-separated word of *ch*.

    Known limitation (as posted in the question): the final word is never
    compared, because the loop stops as soon as no space remains.
    """
    cut = ch.find(" ")
    if cut == -1:
        return ch
    longest = ""
    while cut != -1:
        current = ch[:cut]
        print(current)
        ch = ch[cut + 1:]
        print(ch)
        if len(current) > len(longest):
            longest = current
        cut = ch.find(" ")
    return longest
# Driver: print the longest word found by the function above.
print("maximum est: ",plus_long(ch))
But this one doesn't check the last word because there are no more spaces.
EDIT: Thank you all for the answers, i realised how to solve it this morning by putting ch in a new variable and comparing it to maximum and it worked
# Driver: read the sentence to analyse from stdin.
ch=input("donner: ")
def plus_long(ch):
    """Return the longest space-separated word of *ch*, last word included."""
    cut = ch.find(" ")
    if cut == -1:
        return ch
    longest = ""
    while cut != -1:
        current = ch[:cut]
        print(current)
        ch = ch[cut + 1:]   # original sliced ch[p+1:len(ch)] -- same thing
        print(ch)
        if len(current) > len(longest):
            longest = current
        cut = ch.find(" ")
    # After the loop, ch holds the sentence's final word; compare it too.
    if len(ch) > len(longest):
        longest = ch
    return longest
# Driver: print the longest word found by the function above.
print("maximum est: ",plus_long(ch))
I've split this problem into two parts:
Use find to create the list of words.
Find the longest word in the list.
def plus_long(ch):
    """Return the longest run of alphabetic characters in *ch*.

    Any character absent from the alphabet string (punctuation, digits,
    spaces) acts as a separator, so "Hello!!!" contributes only "Hello".
    """
    letters = "abcdefghijklmnopqrstuvwxyz"
    runs = ['']
    for symbol in ch:
        # find() on the alphabet string doubles as a membership test.
        if letters.find(symbol.lower()) != -1:
            runs[-1] += symbol
        else:
            runs.append('')
    longest = ""
    for run in runs:
        if len(run) > len(longest):
            longest = run
    return longest
Test:
>>> plus_long("Hello!!!!!!!!!!! How are you? I am exhausted.")
Output:
'exhausted'
I see that the other answers don't put punctuation into account, so that using their functions, the result would be 'Hello!!!!!!!!!!!'.
You can find using split and len:
# Longest word
# Reading sentence from user
# Longest word in one line: split on whitespace, take max() with a len key.
sentence = input("Enter sentence: ")
longest = max(sentence.split(), key=len)
print("Longest word is: ", longest)
print("And its length is: ", len(longest))
The output is:
Enter sentence: Tongue tied and twisted just an earth bound misfit I
Longest word is: twisted
And its length is: 7
Here is your code with an addition that checks the last word (len) when p == -1 just as the loop is going to exit. This addition reported the correct max length if the last word is greater than maximum.
# Driver: read the sentence to analyse from stdin.
ch=input("donner: ")
def plus_long(ch):
    """Return the longest space-separated word of *ch*, including the last one."""
    gap = ch.find(" ")
    if gap == -1:
        return ch
    best = ""
    while gap != -1:
        word = ch[:gap]
        print(word)
        ch = ch[gap + 1:]
        print(ch)
        if len(word) > len(best):
            best = word
        gap = ch.find(" ")
    # The loop has consumed everything up to the final word, so 'ch' now
    # holds that word; give it the same comparison the loop gave the others.
    if gap == -1 and len(ch) > len(best):
        best = ch
    return best
# Driver: print the longest word found by the function above.
print("maximum est: ",plus_long(ch))
You can set p to None if the find method returns -1 so that ch[:p] would slice the rest of the string to get the last word:
def plus_long(ch):
    """Return the longest space-separated word, using None to slice the tail."""
    longest = ""
    while True:
        cut = ch.find(" ")
        if cut == -1:
            cut = None      # ch[:None] slices through the end of the string
        word = ch[:cut]
        if len(word) > len(longest):
            longest = word
        if cut is None:     # no space left: the final word has been handled
            return longest
        ch = ch[cut + 1:]
Using find
def plus_long(ch):
    """Return the longest space-separated word of *ch* using only str.find.

    'i' marks the start of the current word and 'n' the position of the
    next space (or the end of the string when no space remains).
    """
    longest = ""
    i = 0
    while i < len(ch):
        n = ch.find(" ", i)
        if n == -1:
            n = len(ch)
        if len(longest) < len(ch[i:n]):
            longest = ch[i:n]
        i = n + 1
    # BUG FIX: the snippet as posted never returned its result, yet the
    # sample outputs below (banana / mountain / there) require it.
    return longest
Test:
print (plus_long("there is a cat on a very big banana tree"))
print (plus_long("there is a cat on a mountain"))
print (plus_long("there"))
Output:
banana
mountain
there
# Driver: read the sentence to analyse from stdin.
ch=input("donner: ")
def plus_long(ch):
    """Return the longest space-separated word of *ch*.

    Builds each word character by character; a space ends the current word.
    """
    word = ''
    maximum = ''
    for letter in ch:
        if letter == ' ':
            if len(maximum) < len(word):
                maximum = word
            word = ''
        else:
            word += letter
    # BUG FIX: the final word is not followed by a space, so the loop above
    # never compared it (the posted code returns "cat" for
    # "cat on a mountain"); run the same comparison once more here.
    if len(maximum) < len(word):
        maximum = word
    return maximum
# Driver: print the longest word found by the function above.
print("maximum est: ",plus_long(ch))
this is a very long and probably the worst method as far as Big 'O' Notation is concerned but it is a simple approach for beginners.
def longWord(sentence):
    """Print the word list and the longest word of *sentence*; return the longest.

    Splits on spaces by hand (beginner style, no split()).

    BUG FIX: the posted version reset biggestWord on every loop iteration
    and only compared neighbouring words, which picks a wrong word for
    inputs such as "abc a b"; scan all words against one running maximum.
    """
    singleWord = ""
    words = []
    for letter in sentence:
        if letter != " ":
            singleWord += letter
        else:
            words += [singleWord]
            singleWord = ""
    words += [singleWord]          # flush the final word (no trailing space)
    biggestWord = ""
    for word in words:
        if len(word) > len(biggestWord):
            biggestWord = word
    print(words)
    print(biggestWord)
    return biggestWord             # also returned so callers can use it

longWord("This is a callback")
OUTPUT -
['This', 'is', 'a', 'callback']
callback
I know you already have a few answers. Here's a single-pass approach that runs in O(n)
# Single pass: remember where the current word starts; whenever a space is
# seen, compare the word that just ended against the longest so far.
ch = input("donner: ")
start = 0
longest = ''
for pos, char in enumerate(ch):
    if char == ' ':
        if len(ch[start:pos]) > len(longest):
            longest = ch[start:pos]
        start = pos + 1
# The text after the final space never hits the branch above; compare it too.
if len(ch[start:]) > len(longest):
    longest = ch[start:]
print ('longest word :', longest, 'length :', len(longest))
The output for this are:
donner: sentence
longest word : sentence length : 8
donner: this sentence
longest word : sentence length : 8
donner: this is a sentence
longest word : sentence length : 8
donner: this sentence is expectedly very lengthy
longest word : expectedly length : 10
donner: word is lengthy
longest word : lengthy length : 7
I wanted to give you an alternate approach as well. I have added comments for you to understand the code
ch = input("donner: ")        # the sentence to scan
collected = []                # every finished word lands in this list
building = ''                 # the word currently being assembled
for character in ch:          # walk the string one character at a time
    if character != ' ':
        building += character           # still inside a word: extend it
    else:
        collected.append(building)      # a space finishes the current word
        building = ''
collected.append(building)    # the final word has no trailing space
long_word = max(collected, key=len)     # longest by length
print ('longest word :', long_word, 'length :', len(long_word))

How can I detect multiple items in a list that are separated with a somewhat equivalent list in Python?

I'm coding in python version 3, and I got a list with positive "words", but some items hold a space:
posWords = ['beautiful', 'love', 'happy day', 'enjoy', 'smelling flowers']
However, the text I need to analyse on positive words aren't holding any spaces within items:
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']
I want to iterate over wordList and when the algorithm sees words that are also in posWords, but merged (e.g. 'happy day'), remove the corresponding words in wordList ('happy', 'day') and add the merged version in wordList.
So in the end, the wordList must look like this:
wordList = ['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
BIG UPDATE:
Because I promised you guys to keep you updated, this is my code so far. It was kinda tricky because in my lists with positive words and negative words there where phrases that contained max three words. So I needed to figure out how to work with that. I realised (also because of the answers you guys gave me, thanks again!) that I had to make lists from all the words of the text that needed to be analysed with either 3, 2, or 1 words in one string item so I can check if the items also appear in my lists of positive words end negative words. Here is my code so far. It's kinda bulky, with a lot of copy pasting... Im planning to fix that but im quite tired and weekend is starting now, so no hate please! (tips are welcome tho)
from textblob import TextBlob
# NOTE(review): this "BIG UPDATE" script was pasted without indentation and
# depends on the third-party textblob package plus three local data files
# (neg_dutch_2.txt, pos_dutch_2.txt, nrc_sample.TXT), so it is documented in
# place rather than restructured. neg_how_many_spaces/pos_how_many_spaces and
# the four PHRASE sections below are near-duplicates of one another.
# open the files with negative words
negatives = open("neg_dutch_2.txt")
neg_list = []
# push all the words from text file to list
for lines in negatives:
lines = lines.lower()
neg_list.append(lines.strip("\n"))
# buckets for phrases grouped by the number of embedded spaces (0..3)
neg_no_space = []
neg_one_space = []
neg_two_spaces = []
neg_three_spaces = []
count = 0
def neg_how_many_spaces(neg_list, neg_no_space, neg_one_space, neg_two_spaces,
neg_three_spaces, count):
# read every word in the list with negative words
for i in range(len(neg_list)):
# every word is a phrase, because there are "words" with spaces
phrase = neg_list[i]
# look at every character and assign the phrase to a list
# that correspondes with the number of spaces in it
for j in range(len(phrase)):
if phrase[j] == " ":
count += 1
# NOTE(review): truthy test on the final character -- True for every
# non-empty phrase, so it never varies; presumably meant as an
# end-of-phrase guard -- confirm the intended condition.
if phrase[-1]:
if count == 1:
neg_one_space.append(phrase)
elif count == 2:
neg_two_spaces.append(phrase)
elif count == 3:
neg_three_spaces.append(phrase)
else:
neg_no_space.append(phrase)
# reset the counter to avoid the total sum of spaces in a list
count = 0
# NOTE(review): this return statement is split over two lines without a
# continuation character or parentheses in the paste -- restore before running.
return neg_list, neg_no_space, neg_one_space, neg_two_spaces,
neg_three_spaces, count
neg_how_many_spaces(neg_list, neg_no_space, neg_one_space,
neg_two_spaces, neg_three_spaces, count)
# open the files with positive words
positives = open("pos_dutch_2.txt")
pos_list = []
# push all the words from text file to list
for lines in positives:
lines = lines.lower()
pos_list.append(lines.strip("\n"))
pos_no_space = []
pos_one_space = []
pos_two_spaces = []
pos_three_spaces = []
count = 0
# Mirror of neg_how_many_spaces for the positive word list.
def pos_how_many_spaces(pos_list, pos_no_space, pos_one_space, pos_two_spaces,
pos_three_spaces, count):
# read every word in the list with positive words
for i in range(len(pos_list)):
# every word is a phrase, because there are "words" with spaces
phrase = pos_list[i]
# look at every character and assign the phrase to a list
# that correspondes with the number of spaces in it
for j in range(len(phrase)):
if phrase[j] == " ":
count += 1
# NOTE(review): same always-true guard as in neg_how_many_spaces.
if phrase[-1]:
if count == 1:
pos_one_space.append(phrase)
elif count == 2:
pos_two_spaces.append(phrase)
elif count == 3:
pos_three_spaces.append(phrase)
else:
pos_no_space.append(phrase)
# reset the counter to avoid the total sum of spaces in a list
count = 0
# NOTE(review): split return statement, as above.
return pos_list, pos_no_space, pos_one_space, pos_two_spaces,
pos_three_spaces, count
pos_how_many_spaces(pos_list, pos_no_space, pos_one_space,
pos_two_spaces, pos_three_spaces, count)
text = open("nrc_sample.TXT")
# reading the article, using TextBlob library to seperate each word
text = text.read()
blob = TextBlob(text)
# these are words that are bound to the meta-deta of the articlesfile
ruis = ["DOCUMENTS", "SECTION", "LENGTH", "LOAD-DATE", "LANGUAGE",
"PUBLICATION-TYPE", "JOURNAL-CODE", "BYLINE", "All", "Rights",
"Reserved", "Copyright", "krant", "Krant", "KRANT", "blz"]
# make a list for all the words in the articles
word_list = []
# and store every word in that list
for word in blob.words:
if not any(x in word for x in ruis):
word = word.lower()
if word.isalpha():
word_list.append(word)
# variables for the frequencies of negative and positive words in articles
amount_pos = 0
amount_neg = 0
count = 0
phrases_four = []
phrases_three = []
phrases_two = []
phrases_one = []
amount_neg = 0
# PHRASE 4
# Build every 4-word window of the article, then count windows that match a
# 4-word (three-space) negative phrase.
for i in range(0, len(word_list)-4, 1):
# NOTE(review): word_list[-1] is truthy for any non-empty list, so this
# guard never varies per iteration; confirm what was intended here.
if word_list[-1]:
phrase = " "
strings = word_list[i], word_list[i+1], word_list[i+2], word_list[i+3]
phrase = phrase.join(strings)
phrases_four.append(phrase)
count = 0
for phrase in phrases_four:
print("phrase4", count, phrase)
count += 1
for neg in neg_three_spaces:
if phrase == neg:
print("negatief woord^")
amount_neg += 1
print(amount_neg)
# PHRASE 3
# Same pattern with 3-word windows against two-space negative phrases.
for i in range(0, len(word_list)-3, 1):
if word_list[-1]:
phrase = " "
strings = word_list[i], word_list[i+1], word_list[i+2]
phrase = phrase.join(strings)
phrases_three.append(phrase)
count = 0
for phrase in phrases_three:
print("phrase3", count, phrase)
count += 1
for neg in neg_two_spaces:
if phrase == neg:
print("negatief woord^")
amount_neg += 1
print(amount_neg)
# PHRASE 2
# start at index zero, till one before end of the list
for i in range(0, len(word_list)-2, 1):
# until it hits the last word of the list, make for every two words in the
# article next to each other a phrase of two words, so we can check if
# there are phrases in the article who also exists in the pos or neg wordslists
if word_list[-1]:
phrase = " "
strings = word_list[i], word_list[i+1]
phrase = phrase.join(strings)
phrases_two.append(phrase)
count = 0
# examine each phrase, and check if the same phrase exists in the list
# with negative phrases containing two words
# dont forget to delete the counter, is only for readability
for phrase in phrases_two:
count += 1
for neg in neg_one_space:
if phrase == neg:
amount_neg += 1
print(amount_neg)
# JUST A WORD
# Finally, single words against the no-space negative list.
for i in range(0, len(word_list)-1, 1):
if word_list[-1]:
phrase = word_list[i]
phrases_one.append(phrase)
count = 0
for phrase in phrases_one:
print("phrase1", count, phrase)
count += 1
for neg in neg_no_space:
if phrase == neg:
print("negatief woord^")
amount_neg += 1
print(amount_neg)
Here is a way to do it:
# Merge multi-word posWords entries back into wordList in place.
posWords = ['beautiful', 'love', 'happy day', 'enjoy', 'smelling flowers']
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']
# Rebuild the sentence so multi-word phrases can be located as substrings.
sentence = " ".join(wordList)
multi_word = [entry for entry in posWords if len(entry.split()) > 1]
for entry in multi_word:
    try:
        at = sentence.index(entry)
    except ValueError:
        continue    # phrase not present in the sentence
    # Spaces before the match = index of the phrase's first word in wordList.
    pos = sentence[:at].count(' ')
    wordList.insert(pos, entry)
    del wordList[pos + 1:pos + 1 + len(entry.split())]
print(wordList)
Output:
['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
Here is another approach that would work for any phrase length:
# Replace consecutive runs of words in wordList with the matching
# multi-word phrase from posWords; works for any phrase length.
posWords = ['beautiful', 'love', 'happy day', 'enjoy', 'smelling flowers']
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']
for phrase in posWords:
    parts = phrase.split(' ')
    if len(parts) <= 1:
        continue    # single words need no merging
    positions = [wordList.index(part) for part in parts]
    # Pairwise index gaps; for a consecutive run their sum equals their count.
    gaps = [abs(b - a) for a in positions[0:-1] for b in positions[1:]]
    if sum(gaps) == len(gaps):
        for part in parts:
            wordList.remove(part)
        wordList.insert(positions[0], phrase)
Output will be:
['I', 'enjoy', 'smelling flowers', 'on', 'a', 'happy day']
But if we for example input something like:
posWords = ['beautiful', 'love', 'happy day', 'enjoy','smelling flowers on']
wordList = ['I', 'enjoy', 'smelling', 'flowers', 'on', 'a', 'happy', 'day']
The output will be:
['I', 'enjoy', 'smelling flowers on', 'a', 'happy day']
Another way to do it:
>>> m=["good bad", "enjoy", "play"]
>>> l=["good", "bad", "happy", "delight"]
>>>
>>> for e in m:
... tmp = e.split(" ")
... if(len(tmp) > 1):
... l = [ent for ent in l if ent not in tmp]
... l.append(" ".join(tmp))
...
>>>
>>> l
['happy', 'delight', 'good bad']

python if multiple string return the words that contains in the sentences

I have a list of words and I want to do if statement, below is my list:
list = ['camera','display','price','memory'(will have 200+ words in the list)]
Here is my code:
# Question snippet: classify a Mention by which keywords it contains.
# NOTE(review): one branch per keyword combination cannot scale to 200+
# keywords -- that combinatorial explosion is what the question asks about.
def check_it(sentences):
if 'camera' in sentences and 'display' in sentences and 'price' in sentences:
return "Camera/Display/Price"
if 'camera' in sentences and 'display' in sentences:
return "Camera/Display"
...
return "Others"
# 'h' is presumably a pandas DataFrame defined elsewhere -- confirm with caller.
h.loc[:, 'Category'] = h.Mention.apply(check_it)
There will be too many combinations for these and also I want to have the words return to row individually.
Does anyone know how to make this sample and return the word individually instead of doing 'camera/display/price'?
Use str.findall by regex - join all values of lists with |, last str.join values by /:
# Build one word-bounded alternation from the keyword list
# (r"\bcamera\b|\bdisplay\b|...") and extract every match per row,
# joining the hits with '/'.
df = pd.DataFrame({'Mention':['camera in sentences and display in sentences',
                              'camera in sentences price']})
L = ['camera','display','price','memory']
pat = '|'.join(r"\b{}\b".format(keyword) for keyword in L)
df['Category'] = df['Mention'].str.findall(pat).str.join('/')
print(df)
Mention Category
0 camera in sentences and display in sentences camera/display
1 camera in sentences price camera/price
Another solution with list comprehension, also for list use generator with join:
# List-comprehension alternatives (no regex), reusing df and L from the
# snippet above: Category1 keeps the matching words as a list, Category2
# joins them with '/'.
df['Category1'] = [[y for y in x.split() if y in L] for x in df['Mention']]
df['Category2'] = ['/'.join(y for y in x.split() if y in L) for x in df['Mention']]
print (df)
Mention Category1 \
0 camera in sentences and display in sentences [camera, display]
1 camera in sentences price [camera, price]
Category2
0 camera/display
1 camera/price
some_words = ['camera','display','price','memory']

def check_it(sentences, words):
    """Return the subset of *words* that occur (as substrings) in *sentences*."""
    find_words = []
    for word in words:
        if word in sentences:
            find_words.append(word)
    return find_words

t = check_it('display has camera and price is', some_words)
# BUG FIX: Python 2 print statement ("print t") -> Python 3 print() call.
print(t)
Why not just check the words in each sentence?
# BUG FIX: the posted list literal contained the prose
# "(will have 200+ words in the list)" inside the brackets -- a syntax error.
wordsList = ['camera', 'display', 'price', 'memory']  # will have 200+ words

def check_it(sentence, wordsList):
    """Return the matching words of *sentence* joined by '/', or 'Others'."""
    wordString = ''
    found = False       # becomes True as soon as any keyword matches
    matches = 0         # how many keywords matched so far
    for word in sentence.split():
        if word in wordsList:
            if matches != 0:
                wordString = wordString + '/' + word
            else:
                wordString = word
            found = True
            matches += 1
    if found:
        return wordString
    return 'Others'

MapReduce to count the frequency of the number consonants in words from a text file

I need a bit of help with Python code to count the frequency of consonants in a word. Consider the following sample input:
"There is no new thing under the sun."
Then the required output would be:
1 : 2
2 : 3
3 : 2
4 : 1
as there are 2 words with 1 consonant, 3 words with 2 consonants, 2 words with 3 consonants and 1 word with 4 consonants.
The following code does a similar job, but it counts the frequency of whole word lengths in a text file instead of consonants. I believe only a small change is needed — looping one level deeper, over the characters of each word.
def freqCounter(file1, file2):
    """Return {word_length: occurrences} over the words of two text files.

    Both files are read completely, lowercased, split on whitespace, and
    stripped of common punctuation before the lengths are tallied.
    Returns an empty dict (after printing a message) if a file is missing.
    """
    freq_dict = {}
    # Precompiled pattern strips punctuation once per word.
    punctuation = re.compile(r'[.?!,"\':;]')
    try:
        with open(file1, "r") as infile, open(file2, "r") as infile2:
            joined = " ".join((infile.read(), infile2.read()))
        for word in joined.lower().split():
            word = punctuation.sub("", word)
            length = len(word)
            # dict.get() replaces the original two-step membership check;
            # the unused dict_static variable from the paste was dropped.
            freq_dict[length] = freq_dict.get(length, 0) + 1
    except IOError as e:
        # BUG FIX: Python 2 print statement -> Python 3 print() call.
        print('Operation failed: %s' % e.strerror)
    return freq_dict
Any help will be much appreciated!
I would try a simpler approach:
from collections import Counter

words = 'There is no new thing under the sun.'
# Delete the lowercase vowels so (mostly) consonants remain; note that
# punctuation and uppercase vowels survive this pass, as in the original.
for vowel in 'aeiou':
    words = words.replace(vowel, '')
word_lengths = map(len, words.split(' '))
c = Counter(word_lengths)
freq_dict = dict(Counter(c))
A simple solution
def freqCounter(_str):
    """Return {consonant_count: number_of_words} for the words of *_str*.

    A character counts as a consonant when it is neither a lowercase vowel
    nor one of the listed punctuation marks.
    """
    freq_dict = {}
    for word in _str.split():
        consonants = sum(
            1 for letter in word
            if letter not in "aeiou.,:;!?[]\"`()'"
        )
        freq_dict[consonants] = freq_dict.get(consonants, 0) + 1
    return freq_dict

txt = "There is no new thing under the sun."
table = freqCounter(txt)
for k in table:
    print(k, ":", table[k])
How about this?
# Create a small sample file to count consonants in.
with open('conts.txt', 'w') as fh:
    fh.write('oh my god becky look at her butt it is soooo big')

consonants = "bcdfghjklmnpqrstvwxyz"

def count_cons(_file):
    """Return {consonants_in_word: word_count} for the words of *_file*."""
    results = {}
    with open(_file, 'r') as fh:
        for line in fh:
            for word in line.split(' '):
                # Number of consonant characters in this word.
                conts = sum(1 if letter in consonants else 0 for letter in word)
                if conts in results:
                    results[conts] += 1
                else:
                    results[conts] = 1
    return results

# BUG FIX: Python 2 print statement -> Python 3 print() call.
print(count_cons('conts.txt'))
Missed the results
{1: 5, 2: 5, 3: 1, 4: 1}
[Finished in 0.0s]

Categories