Word frequency using dictionary - python

My problem is that I can't figure out how to display word counts grouped by word length,
using a dictionary keyed on the length of the words. For example, consider the following piece of text:
"This is the sample text to get an idea!. "
Then the required output would be
3 2
2 3
0 5
as there are 3 words of length 2, 2 words of length 3, and 0 words of length 5 in the
given sample text.
I got as far as displaying how often each word occurs:
def word_frequency(filename):
word_count_list = []
word_freq = {}
text = open(filename, "r").read().lower().split()
word_freq = [text.count(p) for p in text]
dictionary = dict(zip(text,word_freq))
return dictionary
print word_frequency("text.txt")
which displays the dict in this format:
{'all': 3, 'show': 1, 'welcomed': 1, 'not': 2, 'availability': 1, 'television,': 1, '28': 1, 'to': 11, 'has': 2, 'ehealth,': 1, 'do': 1, 'get': 1, 'they': 1, 'milestone': 1, 'kroes,': 1, 'now': 3, 'bringing': 2, 'eu.': 1, 'like': 1, 'states.': 1, 'them.': 1, 'european': 2, 'essential': 1, 'available': 4, 'because': 2, 'people': 3, 'generation': 1, 'economic': 1, '99.4%': 1, 'are': 3, 'eu': 1, 'achievement,': 1, 'said': 3, 'for': 3, 'broadband': 7, 'networks': 2, 'access': 2, 'internet': 1, 'across': 2, 'europe': 1, 'subscriptions': 1, 'million': 1, 'target.': 1, '2020,': 1, 'news': 1, 'neelie': 1, 'by': 1, 'improve': 1, 'fixed': 2, 'of': 8, '100%': 1, '30': 1, 'affordable': 1, 'union,': 2, 'countries.': 1, 'products': 1, 'or': 3, 'speeds': 1, 'cars."': 1, 'via': 1, 'reached': 1, 'cloud': 1, 'from': 1, 'needed': 1, '50%': 1, 'been': 1, 'next': 2, 'households': 3, 'commission': 5, 'live': 1, 'basic': 1, 'was': 1, 'said:': 1, 'more': 1, 'higher.': 1, '30mbps': 2, 'that': 4, 'but': 2, 'aware': 1, '50mbps': 1, 'line': 1, 'statement,': 1, 'with': 2, 'population': 1, "europe's": 1, 'target': 1, 'these': 1, 'reliable': 1, 'work': 1, '96%': 1, 'can': 1, 'ms': 1, 'many': 1, 'further.': 1, 'and': 6, 'computing': 1, 'is': 4, 'it': 2, 'according': 1, 'have': 2, 'in': 5, 'claimed': 1, 'their': 1, 'respective': 1, 'kroes': 1, 'areas.': 1, 'responsible': 1, 'isolated': 1, 'member': 1, '100mbps': 1, 'digital': 2, 'figures': 1, 'out': 1, 'higher': 1, 'development': 1, 'satellite': 4, 'who': 1, 'connected': 2, 'coverage': 2, 'services': 2, 'president': 1, 'a': 1, 'vice': 1, 'mobile': 2, "commission's": 1, 'points': 1, '"access': 1, 'rural': 1, 'the': 16, 'agenda,': 1, 'having': 1}

def freqCounter(infilepath):
    """Count how many words of each length occur in a text file.

    Words are whitespace-separated tokens, so punctuation attached to a
    word counts towards its length.

    Args:
        infilepath: Path of the text file to read.

    Returns:
        A dict mapping word length to the number of words of that length.
    """
    answer = {}
    with open(infilepath) as infile:
        # Iterate over the open file object, not the path string: looping
        # over ``infilepath`` would yield the characters of the file name.
        for line in infile:
            for word in line.split():
                length = len(word)
                answer[length] = answer.get(length, 0) + 1
    return answer
Alternatively:
import collections
def freqCounter(infilepath):
    """Return a Counter mapping word length to number of words in a file.

    Words are whitespace-separated tokens read line by line from
    ``infilepath``.
    """
    with open(infilepath) as infile:
        words = (word for line in infile for word in line.split())
        # Counter consumes the generator here, while the file is still open.
        return collections.Counter(map(len, words))

Use collections.Counter
import collections
sentence = "This is the sample text to get an idea"
Count = collections.Counter([len(a) for a in sentence.split()])
print Count

To count how many words in a text have given lengths: size -> frequency distribution, you could use a regular expression to extract words:
#!/usr/bin/env python3
import re
from collections import Counter
text = "This is the sample text to get an idea!. "
words = re.findall(r'\w+', text.casefold())
frequencies = Counter(map(len, words)).most_common()
print("\n".join(["%d word(s) of length %d" % (n, length)
for length, n in frequencies]))
Output
3 word(s) of length 2
3 word(s) of length 4
2 word(s) of length 3
1 word(s) of length 6
Note: unlike .split()-based solutions, it automatically ignores punctuation such as the !. after 'idea'.
To read words from a file, you could read lines and extract words from them in the same way as is done for text in the first code example:
from itertools import chain
with open(filename) as file:
words = chain.from_iterable(re.findall(r'\w+', line.casefold())
for line in file)
# use words here.. (the same as above)
frequencies = Counter(map(len, words)).most_common()
print("\n".join(["%d word(s) of length %d" % (n, length)
for length, n in frequencies]))
In practice, you could use a list to find the length frequency distribution if you ignore words that are longer than a threshold:
def count_lengths(words, maxlen=100):
    """Return a list where index ``i`` holds the number of words of length ``i``.

    Args:
        words: Iterable of words (anything supporting ``len``).
        maxlen: Longest word length to track; longer words are ignored.

    Returns:
        A list of ``maxlen + 1`` counts, indexed by word length.
    """
    frequencies = [0] * (maxlen + 1)
    for length in map(len, words):
        if length <= maxlen:
            frequencies[length] += 1
    return frequencies
Example
import re
text = "This is the sample text to get an idea!. "
words = re.findall(r'\w+', text.casefold())
frequencies = count_lengths(words)
print("\n".join(["%d word(s) of length %d" % (n, length)
for length, n in enumerate(frequencies) if n > 0]))
Output
3 word(s) of length 2
2 word(s) of length 3
3 word(s) of length 4
1 word(s) of length 6

Related

Python - Check if a key exists in a large nested dictionary

So I have a large nested dictionary which has the following structure:
dic = {Review0: [{'there': 1, 'good': 3, 'news': 4, 'bad': 4, 'first': 3}],
Review1: [{'roomat': 1, 'recent': 1, 'bought': 1, 'explor': 1, 'sport': 1, 'suv': 2, 'realli': 3, 'nice': 4}],
Review2: [{'found': 2, 'pregnanc': 2, 'also': 1, 'nice': 1, 'explor': 1, 'result': 2}]}
So in order to look at the keys in Review0, I can index through dictionary like this dic[0]
I want to find a way to loop through the nested dictionary to check if a key exists from Review0 to ReviewN, so for example if I want to look for the word pregnanc it will find it in Review2 and return True.
Any ideas?
def yourfunc(dic, word='pregnanc'):
    """Report whether ``word`` is a key in any review's first dictionary.

    Args:
        dic: Mapping of review name to a list whose first item is a dict
            of word counts.
        word: Key to search for; defaults to the word used in the question.

    Returns:
        True if some review's first dict contains ``word``, else False
        (an explicit False rather than an implicit None).
    """
    # ``value and ...`` skips reviews whose list is empty instead of
    # raising IndexError on value[0].
    return any(value and word in value[0] for value in dic.values())
data = {'Review0': [{'there': 1, 'good': 3, 'news': 4, 'bad': 4, 'first': 3}],
'Review1': [{'roomat': 1, 'recent': 1, 'bought': 1, 'explor': 1, 'sport': 1, 'suv': 2, 'realli': 3, 'nice': 4}],
'Review2': [{'found': 2, 'pregnanc': 2, 'also': 1, 'nice': 1, 'explor': 1, 'result': 2}]}
print (yourfunc(data))
or if your "reviews" might have multiple items :
def yourfunc(dic, word='pregnanc'):
    """Report whether ``word`` is a key in any dictionary of any review.

    Args:
        dic: Mapping of review name to a list of word-count dicts.
        word: Key to search for; defaults to the word used in the question.

    Returns:
        True if any dict in any review contains ``word``, else False
        (an explicit False rather than an implicit None).
    """
    return any(word in item for value in dic.values() for item in value)
let me know if this isn't what you're looking for.

from collections import defaultdict

Why is it that when I do not set the default factory of defaultdict to int (i.e. a default of zero), my program below does not give me results:
>>> doc
'A wonderful serenity has taken possession of my entire soul, like these sweet mornings of spring which I enjoy with my whole heart. I am alone, and feel the charm of existence in this spot, which was created for the bliss of souls like mine. I am so happy'
>>> some = defaultdict()
>>> for i in doc.split():
... some[i] = some[i]+1
...
Traceback (most recent call last):
File "<stdin>", line 2, in <module>
KeyError: 'A'
>>> some
defaultdict(None, {})
>>> i
'A'
yet it works with a default value
>>> some = defaultdict(int)
>>> for i in doc.split():
... some[i] = some[i]+1
...
>>> some
defaultdict(<class 'int'>, {'A': 1, 'wonderful': 1, 'serenity': 1, 'has': 1, 'taken': 1, 'possession': 1, 'of': 4, 'my': 2, 'entire': 1, 'soul,': 1, 'like': 2, 'these': 1, 'sweet': 1, 'mornings': 1, 'spring': 1, 'which': 2, 'I': 3, 'enjoy': 1, 'with': 1, 'whole': 1, 'heart.': 1, 'am': 2, 'alone,': 1, 'and': 1, 'feel': 1, 'the': 2, 'charm': 1, 'existence': 1, 'in': 1, 'this': 1, 'spot,': 1, 'was': 1, 'created': 1, 'for': 1, 'bliss': 1, 'souls': 1, 'mine.': 1, 'so': 1, 'happy': 1})
>>>
Could you tell me why it works like this?
As the documentation says:
The first argument provides the initial value for the default_factory
attribute; it defaults to None. All remaining arguments are treated
the same as if they were passed to the dict constructor, including
keyword arguments.
Therefore, if you call defaultdict() without passing anything to the constructor, default_factory is None: there is no default value at all, and looking up a missing key raises KeyError, exactly as with a plain dict.
See the output:
some = defaultdict()
print(some) # defaultdict(None, {})
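And when default_factory is None, you cannot execute some[i] = some[i]+1: evaluating some[i] for a key that is not yet present raises KeyError.
Thus, you have to pass the factory explicitly, some = defaultdict(int), so that missing keys start at int(), i.e. 0.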

Python print statement not printing anything at the end of my function

I want to print a phrase at the end of my function, but my desired output is not printing. There are no errors popping up in python, it just isn't printing and acting like it is ignoring it. wordlist is the list of words the user entered to find how many times each word appears in the website they entered. sitewordlist is the entire list of words in the website.
def count(wordlist, sitewordlist):
x = 0
while x < len(wordlist):
numblist = []
wordcount = sitewordlist.count(wordlist[x])
numblist.append(wordcount)
x = x + 1
final(numblist, wordlist)
def final(numblist, wordlist):
y = 0
while y < len(numblist):
print("The word" + wordlist[y] + "appears" + numblist[y] + "times.")
y = y + 1
main()
Problem: in your first while loop you increase x until it is equal to len(wordlist), but your second while loop is only entered if x is smaller than len(wordlist) - that is contradictory.
You can use collections.Counter to count things easily and get a dict from it:
from collections import Counter
def count(wordlist, sitewordlist):
    """Print how often each word of ``wordlist`` occurs in ``sitewordlist``.

    Args:
        wordlist: Words to report on.
        sitewordlist: Iterable of all words to count occurrences in.
    """
    # Count every word once up front instead of rescanning per query word.
    data = Counter(sitewordlist)
    for w in wordlist:
        # A Counter yields 0 for missing keys without inserting them.
        print(f"The word {w} appears {data[w]} times.")
text = """n 1066, William of Normandy introduced what, in later centuries, became referred
to as a feudal system, by which he sought the advice of a council of tenants-in-chief (a
person who held land) and ecclesiastics before making laws. In 1215, the tenants-in-chief
secured Magna Carta from King John, which established that the king may not levy or collect
any taxes (except the feudal taxes to which they were hitherto accustomed), save with the
consent of his royal council, which gradually developed into a parliament. Over the
centuries, the English Parliament progressively limited the power of the English monarchy
which arguably culminated in the English Civil War and the trial and execution of Charles
I in 1649. After the restoration of the monarchy under Charles II, and the subsequent
Glorious Revolution of 1688, the supremacy of Parliament was a settled principle and all
future English and later British sovereigns were restricted to the role of constitutional
monarchs with limited executive authority. The Act of Union 1707 merged the English
Parliament with the Parliament of Scotland to form the Parliament of Great Britain.
When the Parliament of Ireland was abolished in 1801, its former members were merged
into what was now called the Parliament of the United Kingdom.
(quote from: https://en.wikipedia.org/wiki/Parliament_of_England)""".split()
# some cleanup
text[:] = [t.strip(".,-!?1234567890)([]{}\n") for t in text]
words = ["is","and","not","are"]
count(words,text)
Output:
The word is appears 0 times.
The word and appears 6 times.
The word not appears 1 times.
The word are appears 0 times.
Full Counter:
Counter({'the': 22, 'of': 15, 'Parliament': 7, '': 6, 'and': 6, 'a': 5, 'which': 5,
'English': 5, 'in': 4, 'to': 4, 'were': 3, 'with': 3, 'was': 3, 'what': 2, 'later': 2,
'centuries': 2, 'feudal': 2, 'council': 2, 'tenants-in-chief': 2, 'taxes': 2, 'into': 2,
'limited': 2,'monarchy': 2, 'Charles': 2, 'merged': 2, 'n': 1, 'William': 1, 'Normandy': 1,
'introduced': 1, 'became': 1, 'referred': 1, 'as': 1, 'system': 1, 'by': 1, 'he': 1,
'sought': 1, 'advice': 1, 'person': 1, 'who': 1, 'held': 1, 'land': 1, 'ecclesiastics': 1,
'before': 1, 'making': 1, 'laws': 1, 'In': 1, 'secured': 1, 'Magna': 1, 'Carta': 1,
'from': 1, 'King': 1, 'John': 1, 'established': 1, 'that': 1, 'king': 1, 'may': 1,
'not': 1, 'levy': 1, 'or': 1, 'collect': 1, 'any': 1, 'except': 1, 'they': 1,
'hitherto': 1, 'accustomed': 1, 'save': 1, 'consent': 1, 'his': 1, 'royal': 1,
'gradually': 1, 'developed': 1, 'parliament': 1, 'Over': 1, 'progressively': 1, 'power': 1,
'arguably': 1, 'culminated': 1, 'Civil': 1, 'War': 1, 'trial': 1, 'execution': 1,
'I': 1, 'After': 1, 'restoration': 1, 'under': 1, 'II': 1, 'subsequent': 1, 'Glorious': 1,
'Revolution': 1, 'supremacy': 1, 'settled': 1, 'principle': 1, 'all': 1, 'future': 1,
'British': 1, 'sovereigns': 1, 'restricted': 1, 'role': 1, 'constitutional': 1,
'monarchs': 1, 'executive': 1, 'authority': 1, 'The': 1, 'Act': 1, 'Union': 1,
'Scotland': 1, 'form': 1, 'Great': 1, 'Britain': 1, 'When': 1, 'Ireland': 1,
'abolished': 1, 'its': 1, 'former': 1, 'members': 1, 'now': 1, 'called': 1, 'United': 1,
'Kingdom': 1, 'quote': 1, 'from:': 1,
'https://en.wikipedia.org/wiki/Parliament_of_England': 1})
A while loop is not really appropriate here. You can simulate Counter using a normal dict and a while loop like so:
def count_me_other(words, text):
    """Print how often each word of ``words`` occurs in ``text``.

    Demonstrates counting with a plain dict and a ``while`` loop.

    Args:
        words: Whitespace-separated words to report on.
        text: Whitespace-separated text; leading/trailing ``.,!?`` are
            stripped from each token before counting.
    """
    wordlist = words.split()
    tokens = iter(x.strip(".,!?") for x in text.split())
    d = {}
    # An iterator object is always truthy, so loop until next() signals
    # exhaustion rather than testing the iterator itself.
    while True:
        try:
            c = next(tokens)
        except StopIteration:
            break
        d[c] = d.get(c, 0) + 1
    # Report after the loop instead of from inside the exception handler.
    for w in wordlist:
        print(f"The word {w} appears {d.get(w,0)} times.")
wordlist = "A C E G I K M"
text = "A B C D E F G A B C D E F A B C D E A B C D A B C A B A"
count_me_other(wordlist,text)
Output:
The word A appears 7 times.
The word C appears 5 times.
The word E appears 3 times.
The word G appears 1 times.
The word I appears 0 times.
The word K appears 0 times.
The word M appears 0 times.
Or use for ... in conjunction with a normal / defaultdict:
def count_me_other_2(words, text):
    """Print how often each word of ``words`` occurs in ``text``.

    Counts with a plain dict and a ``for`` loop.

    Args:
        words: Whitespace-separated words to report on.
        text: Whitespace-separated text; leading/trailing ``.,!?`` are
            stripped from each token before counting.
    """
    wordlist = words.split()
    splitted = (x.strip(".,!?") for x in text.split())
    d = {}
    for w in splitted:
        if w not in d:
            d[w] = 1
        else:
            d[w] += 1
    for w in wordlist:
        print(f"The word {w} appears {d.get(w,0)} times.")
def count_me_other_3(words, text):
    """Print how often each word of ``words`` occurs in ``text``.

    Counts with a ``defaultdict(int)`` so missing keys start at 0.

    Args:
        words: Whitespace-separated words to report on.
        text: Whitespace-separated text; leading/trailing ``.,!?`` are
            stripped from each token before counting.
    """
    from collections import defaultdict

    wordlist = words.split()
    splitted = (x.strip(".,!?") for x in text.split())
    d = defaultdict(int)
    for w in splitted:
        d[w] += 1
    for w in wordlist:
        # .get() avoids inserting queried-but-absent words into d.
        print(f"The word {w} appears {d.get(w,0)} times.")
count_me_other_2(wordlist,text)
count_me_other_3(wordlist,text)
with identical output.
You're using while-loops to act like for-loops, but you're using the same iterator x in both, and you're not resetting its value to 0 in between. So the second while-loop sees that x is already equal to len(wordlist), and so it doesn't execute the body of the loop.

Python code ignoring if statements

In the process of writing an app, I've come across a bizarre python code piece that I can't understand:
The Code:
msg = [108878, [[314.06, 2, 4.744018], [314.03, 1, 15.9059], [314.02, 2, 79.8338531], [314, 1, 54.90047253], [313.56, 2, 1.75392219], [313.53, 2, 15.61132219], [313.5, 1, 0.554316], [313.42, 1, 1.5976], [313.27, 1, 0.43344], [313.26, 1, 62.724], [313.25, 1, 2.57518855], [313.24, 1, 0.04], [313.09, 2, 22.51784808], [312.9, 1, 40], [312.82, 1, 26.65592034], [312.7, 1, 35.53791008], [312.62, 1, 0.46912], [312.61, 1, 100], [312.6, 1, 48.33502123], [312.57, 1, 4.24547326], [312.56, 2, 0.2], [312.5, 2, 109.76863639], [312.43, 1, 100], [312.42, 1, 0.11142352], [312.4, 1, 7.815571], [314.09, 2, -3.01187461], [314.14, 1, -1.27056771], [314.39, 2, -9.31898324], [314.46, 1, -0.01930229], [314.49, 1, -0.40344], [314.5, 1, -3.40637161], [314.53, 1, -0.2], [314.54, 2, -0.46432889], [314.57, 1, -0.04200538], [314.71, 1, -0.050949], [314.84, 1, -0.02153813], [314.88, 1, -0.04200538], [314.93, 1, -68.439], [314.94, 2, -7.477782], [314.95, 1, -5], [315.1, 1, -5.97], [315.12, 1, -0.01], [315.16, 1, -40], [315.2, 1, -0.04200538], [315.22, 1, -25.7525], [315.23, 1, -78.54523718], [315.38, 1, -80], [315.42, 1, -6.65060488], [315.47, 1, -20], [315.48, 1, -5.36]]]
bids = asks = {}
lenbids = lenasks = 0
for order in msg[1]:
if float(order[2])>0:
lenbids +=1
bids[order[0]]=order[2]
elif float(order[2])<0:
lenasks+=1
asks[order[0]]=-order[2]
print(len(bids),len(asks),lenbids,lenasks)
Output:
50 50 25 25
It seems to me as though python is behaving properly with regards to the lenbids/lenasks actions but is executing the second part of the if statement regardless of whether the condition is met or not?
I'm running Pycharm with Anaconda3 if that makes any difference.
Any help greatly appreciated!
You have set bids and asks to be the same dictionary:
bids = asks = {}
You want them to be different dictionaries
bids, asks = {}, {}

Count every word in a text file python

What i want is to be able to feed in a multiline Text file which is like a paragraph long and then to be returned with something like:
{'Total words': 'NUMBER', 'Words ending with LY': 'NUMBER'}
I have never used Counter before but i believe that is how i would do it. So i want it to count every word and if the word ends in LY add it to the second count. Considering i have never used Counter i don't know where to go...
with open('SOMETHING.txt') as f:
# something to do with counter here?
EDIT: I have to do it without using counter! how would i achieve the same result but without the counter library?
This should work for you...
def parse_file(path='SOMETHING.txt'):
    """Count all words, and the words ending in ``LY``, in a text file.

    Args:
        path: File to read; defaults to the name used in the question.

    Returns:
        A dict with the keys ``'Total words'`` and
        ``'Words ending with LY'``. The suffix match is case-sensitive.
    """
    c1 = 0  # total number of whitespace-separated words
    c2 = 0  # number of words ending with 'LY'
    with open(path, 'r') as f:
        for line in f:
            words = line.split()
            c1 += len(words)
            c2 += sum(1 for word in words if word.endswith('LY'))
    return {'Total words': c1, 'Words ending with LY': c2}
I would recommend however, you have a look at a few python basics.
Is this hard to try?
from collections import defaultdict
result = defaultdict(int)
result_second = defaultdict(int)
for word in open('text.txt').read().split():
result[word] += 1
if word.endswith('LY'):
result_second[word] +=1
print result,result_second
Output:
defaultdict(<type 'int'>, {'and': 1, 'Considering': 1, 'have': 2, "don't": 1, 'is': 1, 'it': 2, 'second': 1, 'want': 1, 'in': 1, 'before': 1, 'would': 1, 'to': 3, 'count.': 1, 'go...': 1, 'how': 1, 'add': 1, 'if': 1, 'LY': 1, 'it.': 1, 'do': 1, 'ends': 1, 'used': 2, 'that': 1, 'I': 1, 'Counter': 2, 'but': 1, 'So': 1, 'know': 1, 'never': 2, 'believe': 1, 'count': 1, 'word': 2, 'i': 5, 'every': 1, 'the': 2, 'where': 1})
Use collections.Counter()
import collections
with open('your_file.txt') as fp:
text = fp.read()
counter = collections.Counter(['ends_in_ly' if token.endswith('LY') else 'doesnt_end_in_ly' for token in text.split()])
Without counter
def count_ly(path='file.txt'):
    """Count tokens that do and do not end in ``LY`` without using Counter.

    The original snippet used ``return`` at module level, which is a
    SyntaxError; wrapping it in a function makes it runnable.

    Args:
        path: File to read; defaults to the name used in the snippet.

    Returns:
        A dict with the keys ``'ending_in_ly'`` and ``'not_ending_in_ly'``.
    """
    with open(path) as fp:
        tokens = fp.read().split()
    # bool is an int subclass, so summing the matches counts them.
    c = sum(token.endswith('LY') for token in tokens)
    return {'ending_in_ly': c, 'not_ending_in_ly': len(tokens) - c}

Categories