Try every weighted combination of letters from the text result of tesseract - python

I've been testing text recognition from images using pyocr (tesseract-ocr and libetesseract). I've been applying various PIL.ImageFilters and getting the result of one specific string in the image. It has not been accurate, but I have 14 different results. Between all of them, all of the correct letters of the string in the image are there. So I have enumerated each string and created a dict containing the characters' position as keys that contain a dict of each character that has appeared in that position at keys and the number of occurrences as the value. Here's a shortened example
String In Image:
2HG2
Results:
#Note: this is not the actual order in which the strings are produced
2HC2
2HC2
2HCZ
2HOZ
2HOZ
2HOZ
2HOZ
2HGZ
2HGZ
2HGZ
ZHGZ
ZHGZ
ZH6Z
ZN6z
Dictionary:
{
0: {
u'2': 10,
u'Z': 4
}, 1: {
u'H': 13,
u'N': 1
}, 2: {
u'C': 3,
u'O': 4,
u'G': 5,
u'6': 2
}, 3: {
u'2': 2,
u'Z': 11,
u'z': 1
}
}
I'd like to try each combination of letters in each position until I get 2HG2. Any help would be appreciated.
EDIT:
The goal I'm trying to achieve is to scan a car registration, get text from it, and then populate a form with the data. As a proof of concept, I'm trying to get the VIN number from my person registration. At the moment, I'm (most likely naively) applying a series of PIL.ImageFilters and getting text from each. Below is my script.
import re
from itertools import permutations
from PIL import Image, ImageFilter
import pyocr
from pyocr import builders
vins = []
characters = {}
def validate(vincode):
"""
Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number
"""
maps = "0123456789X"
weights = [
8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
]
table = {
"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
"J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
"S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
}
if not isinstance(vincode, str) and not isinstance(vincode, unicode):
return False
if len(vincode) != 17:
return False
vincode = vincode.upper()
if "I" in vincode or "O" in vincode or "Q" in vincode:
return False
total = 0
for index, value in enumerate(vincode):
try:
products = table[value] * weights[index]
except KeyError:
break
total += products
index = total % 11
return maps[index] == vincode[8]
def get_text(tools_, img_):
for tool in tools_:
if tool.get_name() == 'Cuneiform (sh)':
continue
# print '=======================\nUsing {}\n======================='.format(tool.get_name())
boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder())
global vins
pattern = re.compile('[\W_]+')
vins += [pattern.sub('', x.content) for x in boxes if len(pattern.sub('', x.content)) == 17]
# boxes = [x for x in boxes if len(x.content.strip()) != 0]
# print boxes[3].content
# for box in boxes:
# print box.content
def apply_filters_and_get_text(img_, filter_):
for x in range(1, 5):
print 'Applying {} size: {}'.format(str(filter_), x)
try:
img_ = img_.filter(filter_(x))
except ValueError:
print 'error on {} size: {}'.format(str(filter_), x)
continue
img_.save('tmp{}-{}.jpg'.format(str(filter_), x))
get_text(tools, img_)
def count_occurrences(value):
global characters
for index, c in enumerate(value):
if index in characters and c in characters[index]:
characters[index][c] += 1
continue
if index in characters and isinstance(characters[index], dict):
characters[index][c] = 1
continue
characters[index] = {c: 1}
tools = pyocr.get_available_tools()
img = Image.open('images/test18.jpg')
# get_text(tools)
# img = img.filter(ImageFilter.MaxFilter(5))
# img = img.filter(ImageFilter.SHARPEN)
# img = img.filter(ImageFilter.SMOOTH_MORE)
# get_text(tools)
# get_text(tools)
img = img.convert('L')
# get_text(tools)
# img = img.filter(ImageFilter.MaxFilter(5))
# img = img.filter(ImageFilter.SHARPEN)
# img = img.filter(ImageFilter.SMOOTH_MORE)
# get_text(tools)
# get_text(tools)
img = img.point(lambda x: 0 if x < 128 else 255, '1')
apply_filters_and_get_text(img, ImageFilter.MedianFilter)
apply_filters_and_get_text(img, ImageFilter.MinFilter)
apply_filters_and_get_text(img, ImageFilter.MaxFilter)
apply_filters_and_get_text(img, ImageFilter.ModeFilter)
for vin in vins:
count_occurrences(vin)
# print vin
# print validate(vin)
print characters

I was able to figure out a recursive function that tries every combination of the letters with priority to characters with higher weight.
def determine_character(characters_, tried=[]):
next_character = ""
current_rank = 0
for ch in characters_:
if characters_[ch] > current_rank and ch not in tried:
next_character = ch
return next_character
def determine_weight(word):
global characters
weight = 0
for index, ch in enumerate(word):
weight += characters[index][ch]
return weight
def descramble(word="", index=0):
global characters
count = len(characters)
if index == count and validate(word):
global vin_count, valid_vins
vin_count += 1
valid_vins.append({'vin': word, 'weight': determine_weight(word)})
return {'word': word, 'done': True}
if index == count:
return False
tried = []
while len(tried) < len(characters[index]):
ch = determine_character(characters[index], tried)
tried.append(ch)
next_index = index + 1
descramble("{word}{ch}".format(word=word, ch=ch), next_index)

Related

Unable to execute python exec inside aws lambda

I am trying to run a user-generated simple code (two sum) inside AWS lambda python runtime Runtime.PYTHON_3_9.
def handler(event, context):
a = {
"code": "def twoSum(nums, target):\n hash_map = {}\n\n for i, num in enumerate(nums):\n if target - num in hash_map:\n return list(sorted([i, hash_map[target - num]]))\n hash_map[num] = i\n\ndef solve():\r\n testcases = [[[2, 7, 11, 15], 9]]\r\n output = [[0, 1]]\r\n\r\n total = len(testcases)\r\n correct = 0\r\n\r\n for i in range(len(testcases)):\r\n result = twoSum(*testcases[i])\r\n\r\n if result == output[i]:\r\n correct += 1\r\n\r\n print(correct, total)\r\n\r\n\r\nsolve()\r\n"
}
code = a["code"]
print(code)
exec(code)
This code prints
def twoSum(nums, target):
hash_map = {}
for i, num in enumerate(nums):
if target - num in hash_map:
return list(sorted([i, hash_map[target - num]]))
hash_map[num] = i
def solve():
testcases = [[[2, 7, 11, 15], 9]]
output = [[0, 1]]
total = len(testcases)
correct = 0
for i in range(len(testcases)):
result = twoSum(*testcases[i])
if result == output[i]:
correct += 1
print(correct, total)
solve()
while the actual code is
def twoSum(nums, target):
hash_map = {}
for i, num in enumerate(nums):
if target - num in hash_map:
return list(sorted([i, hash_map[target - num]]))
hash_map[num] = i
def solve():
testcases = [[[2, 7, 11, 15], 9]]
output = [[0, 1]]
total = len(testcases)
correct = 0
for i in range(len(testcases)):
result = twoSum(*testcases[i])
if result == output[i]:
correct += 1
print(correct, total)
solve()
All the spaces inside the exec strings are getting removed for some reason. Tried the same code on local Windows and macOS. It ran without any issues.
And this is the error I received
{
"errorMessage": "name 'twoSum' is not defined",
"errorType": "NameError",
"requestId": "<some-request-id>",
"stackTrace": [
" File \"/var/task/python.py\", line 10, in handler\n exec(code)\n",
" File \"<string>\", line 25, in <module>\n",
" File \"<string>\", line 17, in solve\n"
]
}
Is there any way to resolve this and make the exec work?

Getting KeyError when assigning value with __getitem__ method

I want to realize the bert model.
So I built a class with __getitem__ in it.
I can print something like test[0], but when I assign a value, like data = test[0], a KeyError occurs.
import random
"""
corpus_file = 'vocab'
vocab_size = 6
vocab_freq = 1
save_path = 'obj/'
max_sentence = 16
corpus -> org_line -> ope_line
corpus -> org_line -> token_list -> idx_to_token + token_to_idx
"""
class vocab():
def __init__(self, corpus_file, vocab_size, vocab_freq,save_path,max_sentence):
self.max_sentence = max_sentence
self.special_labels = ['PAD', 'UNK', 'SEP', 'CLS', 'MASK']
# output
self.data = []
self.idx_to_token = []
self.token_to_idx = {}
# ope
self.pre_ope(corpus_file,vocab_size,vocab_freq)
#self.save_data(save_path)
#self.print_data()
def pre_ope(self,corpus_file,vocab_size,vocab_freq):
token_list = {}
with open(corpus_file, 'r') as f:
while 1:
new_org_line = f.readline()
if new_org_line != '':
new_org_line = new_org_line.strip('\n')
new_sentence = new_org_line.split('\t')
sentence = []
for tmp in new_sentence:
token_sentence = tmp.split()
sentence.append(token_sentence)
for token in token_sentence:
if token_list.get(token):
token_list[token] += 1
else:
new_token = {token: 1}
token_list.update(new_token)
self.data.append(sentence)
else:
break
f.close()
token_list = sorted(token_list.items(), key=lambda i: (-i[1], i[0]))
self.build_dictionary(token_list,vocab_freq,vocab_size)
'''
Special labels:
PAD
UNK
SEP sentence separator
CLS classifier token
MASK
'''
def build_dictionary(self,token_list,vocab_freq,vocab_size):
for idx, label in enumerate(self.special_labels):
self.idx_to_token.append(label)
self.token_to_idx[label] = idx
for idx, (token, freq) in enumerate(token_list):
if freq >= vocab_freq :
self.idx_to_token.append(token)
self.token_to_idx[token] = idx + len(self.special_labels)
if len(self.idx_to_token) >= vocab_size + len(self.special_labels) and vocab_size != 0 :
break
def __len__(self):
return len(self.data)
def print_data(self):
print(self.data)
print(self.idx_to_token)
print(self.token_to_idx)
def __getitem__(self, item):
s1,s2,is_next_sentence = self.get_random_next_sentence(item)
s1,s1_label = self.get_random_sentence(s1)
s2,s2_label = self.get_random_sentence(s2)
sentence = [self.token_to_idx['CLS']] +s1 +[self.token_to_idx['SEP']] +s2 +[self.token_to_idx['SEP']]
label = [-1] +s1_label +[-1] +s2_label +[-1]
if len(sentence) > self.max_sentence :
print('sentence is greater than the setting of max sentence')
for pos in range(len(sentence),self.max_sentence):
sentence.append(self.token_to_idx['PAD'])
label.append(-1)
return {
'token' : sentence,
'label' : label,
'is_next' : is_next_sentence
}
def get_random_next_sentence(self,item):
s1 = self.data[item][0]
s2 = self.data[item][1]
if random.random() < 0.5 :
is_next = 0
s2 = self.data[self.get_random_line(item)][1]
else:
is_next = 1
return s1,s2,is_next
def get_random_line(self,item):
rand = random.randint(0,len(self.data)-1)
while rand == item :
rand = random.randint(0,len(self.data)-1)
return rand
def get_random_sentence(self,sentence):
label = []
for idx,token in enumerate(sentence):
rand = random.random()
if rand < 0.15:
rand = rand/0.15
if rand < 0.8: #mask
sentence[idx] = self.token_to_idx['MASK']
elif rand < 0.9: #rand
sentence[idx] = random.randint(len(self.special_labels),len(self.token_to_idx)-1)
else: # still
sentence[idx] = self.token_to_idx[token]
label.append(self.token_to_idx[token])
else:
sentence[idx] = self.token_to_idx[token]
label.append(-1)
return sentence,label
if __name__ == '__main__':
test = vocab('vocab', 0, 1,'obj/',16)
print(len(test))
print(test[0])
print(test[1])
data = test[0]
Result:
2
{'token': [3, 4, 18, 12, 15, 11, 2, 7, 9, 13, 2, 0, 0, 0, 0, 0], 'label': [-1, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 'is_next': 0}
{'token': [3, 6, 4, 5, 8, 5, 17, 2, 16, 5, 14, 20, 2, 0, 0, 0], 'label': [-1, -1, 19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], 'is_next': 0}
Traceback (most recent call last):
File "vocab.py", line 146, in <module>
data = test[0]
File "vocab.py", line 90, in ```__getitem__```
s1,s1_label = self.get_random_sentence(s1)
File "vocab.py", line 136, in get_random_sentence
sentence[idx] = self.token_to_idx[token]
KeyError: 4
vocab file:
hello this is my home nice to meet you
I want to go to school and have lunch
change the code:
def get_random_next_sentence(self,item):
s1 = self.data[item][0]
s2 = self.data[item][1]
if random.random() < 0.5 :
is_next = 0
s2 = self.data[self.get_random_line(item)][1]
else:
is_next = 1
return s1,s2,is_next
to:
def get_random_next_sentence(self,item):
s1 = copy.deepcopy(self.data[item][0])
s2 = copy.deepcopy(self.data[item][1])
if random.random() < 0.5 :
is_next = 0
s2 = copy.deepcopy(self.data[self.get_random_line(item)][1])
print(s2)
else:
is_next = 1
return s1,s2,is_next

List prints letters and not strings?

I'm having a hard time getting the following to print correctly:
core = 1, 2, 3, 4, 5
glutes = 6, 7, 8, 9, 10
upper = 11, 12, 13, 14, 15
lower = 16, 17, 18, 19, 20
conditioning = 21, 22, 23, 24, 25
core_ability = int(input("Core: "))
glute_ability = int(input("Glutes: "))
if core_ability > 4:
upper_ability = int(input("Upper body: "))
else:
""
lower_ability = int(input("Lower body: "))
conditioning_ability = int(input("\nConditioning ability level:"))
newcore = core[0:core_ability]
newglutes = glutes[0:glute_ability]
if core_ability > 4:
newupper = upper[0:upper_ability]
newlower = lower[0:lower_ability]
newconditioning = conditioning[0:conditioning_ability]
if core_ability > 4:
movement_bank = str(newcore) + str(newglutes) + str(newupper) + str(newlower) + str(conditioning_ability)
else:
movement_bank = str(newcore) + str(newglutes) + str(newlower) + str(conditioning_ability)
sections = int(input("\nNumber of GPP sections in the session: "))
print("\nSPECIFY THE NUMBER OF MOVEMENTS PER SECTION")
if sections == 1:
section1_num = int(input("Section 1:"))
print(random.sample(movement_bank[0:], k=section1_num))
I get an output the looks like:
' ', ' ', 'r'
when I'd like to get something like:
'1', '16', '8'
I added "str()" to each list in the "movement_bank" list because without it I got an error of: TypeError: can only concatenate list (not "int") to list.
All help is greatly appreciated.
It seems, you have different lists, and want to combine them all into one list.
Use extend:
core = 1, 2, 3, 4, 5
glutes = 6, 7, 8, 9, 10
upper = 11, 12, 13, 14, 15
lower = 16, 17, 18, 19, 20
conditioning = 21, 22, 23, 24, 25
movement_bank = []
core_ability = int(input("Core: "))
movement_bank.extend(core[:core_ability])
glute_ability = int(input("Glutes: "))
movement_bank.extend(glutes[:glute_ability])
if core_ability > 4:
upper_ability = int(input("Upper body: "))
movement_bank.extend(upper[:upper_ability])
lower_ability = int(input("Lower body: "))
movement_bank.extend(lower[:lower_ability])
conditioning_ability = int(input("\nConditioning ability level:"))
movement_bank.extend(conditioning[:conditioning_ability])
sections = int(input("\nNumber of GPP sections in the session: "))
print("\nSPECIFY THE NUMBER OF MOVEMENTS PER SECTION")
if sections == 1:
section1_num = int(input("Section 1:"))
print(random.sample(movement_bank, k=section1_num))

Nested dictionary replacing previous value + key instead of appending

I am working on vector space model, data set consists of 50 text files. Iterating through them splitting into words and saving them in dictionary. Now i want to use nested dictionary like:
dictionary = { {someword: {Doc1:23},{Doc21:2},{Doc34:3}},
{someword: {Doc1:23},{Doc21:2},{Doc34:3}},
{someword: {Doc1:23},{Doc21:2},{Doc34:3}}
}
but when i am running my program it replaces not only the document but also it does not calculates frequency by adding that how many times 'someword' occurred in a particular document.
for iterator in range(1, 51):
f = open(directory + str(iterator) + ext, "r")
for line in f.read().lower().split():
line = getwords(line)
for word in line:
if check(word, stopwords) == 0:
if existence(word, terms, iterator) != 1:
terms[word] = {}
terms[word]["Doc"+str(iterator)] = 1
else:
terms[word]["Doc"+str(iterator)] = int(terms[word]["Doc"+str(iterator)]) + 1
f.close()
existence function is :
def existence(tok, diction, iteration):
if tok in diction:
temp = "Doc"+str(iteration)
if temp in diction:
return 1
else:
return 0
else:
return 0
Result Somewhat like this.
{'blunder': {'Doc1': 1}, 'by': {'Doc50': 1}, 'anton': {'Doc27': 1}, 'chekhov': {'Doc27': 1}, 'an': {'Doc50': 1}, 'illustration': {'Doc48': 1}, 'story': {'Doc48': 1}, 'author': {'Doc48': 1}, 'portrait'...
Do you want to know how many times each word appears in each file? This is easily accomplished with a defaultdict of Counters, courtesy of the collections module.
You've got the right idea I think, looping over the files, reading line by line and splitting into words. It's the counting part you need help with.
from collections import defaultdict, Counter
from string import punctuation
fnames = ['1.txt', '2.txt', '3.txt', '4.txt', '5.txt']
word_counter = defaultdict(Counter)
for fname in fnames:
with open(fname, 'r') as txt:
for line in txt:
words = line.lower().strip().split()
for word in words:
word = word.strip(punctuation)
if word:
word_counter[word][fname] += 1
The data look will like this inside word_counter:
{
'within': {
'1.txt': 2,
},
'we': {
'1.txt': 3,
'2.txt': 2,
'3.txt': 2,
'4.txt': 2,
'5.txt': 4,
},
'do': {
'1.txt': 7,
'2.txt': 8,
'3.txt': 8,
'4.txt': 6,
'5.txt': 5,
},
...
}

Converting number words into numbers - Python [duplicate]

I need to convert one into 1, two into 2 and so on.
Is there a way to do this with a library or a class or anything?
The majority of this code is to set up the numwords dict, which is only done on the first call.
def text2int(textnum, numwords={}):
if not numwords:
units = [
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion"]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
current = result = 0
for word in textnum.split():
if word not in numwords:
raise Exception("Illegal word: " + word)
scale, increment = numwords[word]
current = current * scale + increment
if scale > 100:
result += current
current = 0
return result + current
print text2int("seven billion one hundred million thirty one thousand three hundred thirty seven")
#7100031337
I have just released a python module to PyPI called word2number for the exact purpose. https://github.com/akshaynagpal/w2n
Install it using:
pip install word2number
make sure your pip is updated to the latest version.
Usage:
from word2number import w2n
print w2n.word_to_num("two million three thousand nine hundred and eighty four")
2003984
I needed something a bit different since my input is from a speech-to-text conversion and the solution is not always to sum the numbers. For example, "my zipcode is one two three four five" should not convert to "my zipcode is 15".
I took Andrew's answer and tweaked it to handle a few other cases people highlighted as errors, and also added support for examples like the zipcode one I mentioned above. Some basic test cases are shown below, but I'm sure there is still room for improvement.
def is_number(x):
if type(x) == str:
x = x.replace(',', '')
try:
float(x)
except:
return False
return True
def text2int (textnum, numwords={}):
units = [
'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
'sixteen', 'seventeen', 'eighteen', 'nineteen',
]
tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
if not numwords:
numwords['and'] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
textnum = textnum.replace('-', ' ')
current = result = 0
curstring = ''
onnumber = False
lastunit = False
lastscale = False
def is_numword(x):
if is_number(x):
return True
if word in numwords:
return True
return False
def from_numword(x):
if is_number(x):
scale = 0
increment = int(x.replace(',', ''))
return scale, increment
return numwords[x]
for word in textnum.split():
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
current = current * scale + increment
if scale > 100:
result += current
current = 0
onnumber = True
lastunit = False
lastscale = False
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if (not is_numword(word)) or (word == 'and' and not lastscale):
if onnumber:
# Flush the current number we are building
curstring += repr(result + current) + " "
curstring += word + " "
result = current = 0
onnumber = False
lastunit = False
lastscale = False
else:
scale, increment = from_numword(word)
onnumber = True
if lastunit and (word not in scales):
# Assume this is part of a string of individual numbers to
# be flushed, such as a zipcode "one two three four five"
curstring += repr(result + current)
result = current = 0
if scale > 1:
current = max(1, current)
current = current * scale + increment
if scale > 100:
result += current
current = 0
lastscale = False
lastunit = False
if word in scales:
lastscale = True
elif word in units:
lastunit = True
if onnumber:
curstring += repr(result + current)
return curstring
Some tests...
one two three -> 123
three forty five -> 345
three and forty five -> 3 and 45
three hundred and forty five -> 345
three hundred -> 300
twenty five hundred -> 2500
three thousand and six -> 3006
three thousand six -> 3006
nineteenth -> 19
twentieth -> 20
first -> 1
my zip is one two three four five -> my zip is 12345
nineteen ninety six -> 1996
fifty-seventh -> 57
one million -> 1000000
first hundred -> 100
I will buy the first thousand -> I will buy the 1000 # probably should leave ordinal in the string
thousand -> 1000
hundred and six -> 106
1 million -> 1000000
If anyone is interested, I hacked up a version that maintains the rest of the string (though it may have bugs, haven't tested it too much).
def text2int (textnum, numwords={}):
if not numwords:
units = [
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion"]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
textnum = textnum.replace('-', ' ')
current = result = 0
curstring = ""
onnumber = False
for word in textnum.split():
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
current = current * scale + increment
if scale > 100:
result += current
current = 0
onnumber = True
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if word not in numwords:
if onnumber:
curstring += repr(result + current) + " "
curstring += word + " "
result = current = 0
onnumber = False
else:
scale, increment = numwords[word]
current = current * scale + increment
if scale > 100:
result += current
current = 0
onnumber = True
if onnumber:
curstring += repr(result + current)
return curstring
Example:
>>> text2int("I want fifty five hot dogs for two hundred dollars.")
I want 55 hot dogs for 200 dollars.
There could be issues if you have, say, "$200". But, this was really rough.
I needed to handle a couple extra parsing cases, such as ordinal words ("first", "second"), hyphenated words ("one-hundred"), and hyphenated ordinal words like ("fifty-seventh"), so I added a couple lines:
def text2int(textnum, numwords={}):
if not numwords:
units = [
"zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
"nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
"sixteen", "seventeen", "eighteen", "nineteen",
]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion"]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
textnum = textnum.replace('-', ' ')
current = result = 0
for word in textnum.split():
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if word not in numwords:
raise Exception("Illegal word: " + word)
scale, increment = numwords[word]
current = current * scale + increment
if scale > 100:
result += current
current = 0
return result + current`
Here's the trivial case approach:
>>> number = {'one':1,
... 'two':2,
... 'three':3,}
>>>
>>> number['two']
2
Or are you looking for something that can handle "twelve thousand, one hundred seventy-two"?
def parse_int(string):
ONES = {'zero': 0,
'one': 1,
'two': 2,
'three': 3,
'four': 4,
'five': 5,
'six': 6,
'seven': 7,
'eight': 8,
'nine': 9,
'ten': 10,
'eleven': 11,
'twelve': 12,
'thirteen': 13,
'fourteen': 14,
'fifteen': 15,
'sixteen': 16,
'seventeen': 17,
'eighteen': 18,
'nineteen': 19,
'twenty': 20,
'thirty': 30,
'forty': 40,
'fifty': 50,
'sixty': 60,
'seventy': 70,
'eighty': 80,
'ninety': 90,
}
numbers = []
for token in string.replace('-', ' ').split(' '):
if token in ONES:
numbers.append(ONES[token])
elif token == 'hundred':
numbers[-1] *= 100
elif token == 'thousand':
numbers = [x * 1000 for x in numbers]
elif token == 'million':
numbers = [x * 1000000 for x in numbers]
return sum(numbers)
Tested with 700 random numbers in range 1 to million works well.
Make use of the Python package: WordToDigits
pip install wordtodigits
It can find numbers present in word form in a sentence and then convert them to the proper numeric format. Also takes care of the decimal part, if present. The word representation of numbers could be anywhere in the passage.
This could be easily be hardcoded into a dictionary if there's a limited amount of numbers you'd like to parse.
For slightly more complex cases, you'll probably want to generate this dictionary automatically, based on the relatively simple numbers grammar. Something along the lines of this (of course, generalized...)
for i in range(10):
myDict[30 + i] = "thirty-" + singleDigitsDict[i]
If you need something more extensive, then it looks like you'll need natural language processing tools. This article might be a good starting point.
Made change so that text2int(scale) will return correct conversion. Eg, text2int("hundred") => 100.
import re
numwords = {}
def text2int(textnum):
if not numwords:
units = [ "zero", "one", "two", "three", "four", "five", "six",
"seven", "eight", "nine", "ten", "eleven", "twelve",
"thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
"eighteen", "nineteen"]
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
"seventy", "eighty", "ninety"]
scales = ["hundred", "thousand", "million", "billion", "trillion",
'quadrillion', 'quintillion', 'sexillion', 'septillion',
'octillion', 'nonillion', 'decillion' ]
numwords["and"] = (1, 0)
for idx, word in enumerate(units): numwords[word] = (1, idx)
for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5,
'eighth':8, 'ninth':9, 'twelfth':12}
ordinal_endings = [('ieth', 'y'), ('th', '')]
current = result = 0
tokens = re.split(r"[\s-]+", textnum)
for word in tokens:
if word in ordinal_words:
scale, increment = (1, ordinal_words[word])
else:
for ending, replacement in ordinal_endings:
if word.endswith(ending):
word = "%s%s" % (word[:-len(ending)], replacement)
if word not in numwords:
raise Exception("Illegal word: " + word)
scale, increment = numwords[word]
if scale > 1:
current = max(1, current)
current = current * scale + increment
if scale > 100:
result += current
current = 0
return result + current
A quick solution is to use the inflect.py to generate a dictionary for translation.
inflect.py has a number_to_words() function, that will turn a number (e.g. 2) to it's word form (e.g. 'two'). Unfortunately, its reverse (which would allow you to avoid the translation dictionary route) isn't offered. All the same, you can use that function to build the translation dictionary:
>>> import inflect
>>> p = inflect.engine()
>>> word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
... word_form = p.number_to_words(i) # 1 -> 'one'
... word_to_number_mapping[word_form] = i
...
>>> print word_to_number_mapping['one']
1
>>> print word_to_number_mapping['eleven']
11
>>> print word_to_number_mapping['forty-three']
43
If you're willing to commit some time, it might be possible to examine inflect.py's inner-workings of the number_to_words() function and build your own code to do this dynamically (I haven't tried to do this).
There's a ruby gem by Marc Burns that does it. I recently forked it to add support for years. You can call ruby code from python.
require 'numbers_in_words'
require 'numbers_in_words/duck_punch'
nums = ["fifteen sixteen", "eighty five sixteen", "nineteen ninety six",
"one hundred and seventy nine", "thirteen hundred", "nine thousand two hundred and ninety seven"]
nums.each {|n| p n; p n.in_numbers}
results:
"fifteen sixteen"
1516
"eighty five sixteen"
8516
"nineteen ninety six"
1996
"one hundred and seventy nine"
179
"thirteen hundred"
1300
"nine thousand two hundred and ninety seven"
9297
I took #recursive's logic and converted to Ruby. I've also hardcoded the lookup table so its not as cool but might help a newbie understand what is going on.
WORDNUMS = {"zero"=> [1,0], "one"=> [1,1], "two"=> [1,2], "three"=> [1,3],
"four"=> [1,4], "five"=> [1,5], "six"=> [1,6], "seven"=> [1,7],
"eight"=> [1,8], "nine"=> [1,9], "ten"=> [1,10],
"eleven"=> [1,11], "twelve"=> [1,12], "thirteen"=> [1,13],
"fourteen"=> [1,14], "fifteen"=> [1,15], "sixteen"=> [1,16],
"seventeen"=> [1,17], "eighteen"=> [1,18], "nineteen"=> [1,19],
"twenty"=> [1,20], "thirty" => [1,30], "forty" => [1,40],
"fifty" => [1,50], "sixty" => [1,60], "seventy" => [1,70],
"eighty" => [1,80], "ninety" => [1,90],
"hundred" => [100,0], "thousand" => [1000,0],
"million" => [1000000, 0]}
def text_2_int(string)
numberWords = string.gsub('-', ' ').split(/ /) - %w{and}
current = result = 0
numberWords.each do |word|
scale, increment = WORDNUMS[word]
current = current * scale + increment
if scale > 100
result += current
current = 0
end
end
return result + current
end
I was looking to handle strings like two thousand one hundred and forty-six
This handles number in words of Indian style, some fractions, combination of numbers and words and also addition.
def words_to_number(words):
numbers = {"zero":0, "a":1, "half":0.5, "quarter":0.25, "one":1,"two":2,
"three":3, "four":4,"five":5,"six":6,"seven":7,"eight":8,
"nine":9, "ten":10,"eleven":11,"twelve":12, "thirteen":13,
"fourteen":14, "fifteen":15,"sixteen":16,"seventeen":17,
"eighteen":18,"nineteen":19, "twenty":20,"thirty":30, "forty":40,
"fifty":50,"sixty":60,"seventy":70, "eighty":80,"ninety":90}
groups = {"hundred":100, "thousand":1_000,
"lac":1_00_000, "lakh":1_00_000,
"million":1_000_000, "crore":10**7,
"billion":10**9, "trillion":10**12}
split_at = ["and", "plus"]
n = 0
skip = False
words_array = words.split(" ")
for i, word in enumerate(words_array):
if not skip:
if word in groups:
n*= groups[word]
elif word in numbers:
n += numbers[word]
elif word in split_at:
skip = True
remaining = ' '.join(words_array[i+1:])
n+=words_to_number(remaining)
else:
try:
n += float(word)
except ValueError as e:
raise ValueError(f"Invalid word {word}") from e
return n
TEST:
print(words_to_number("a million and one"))
>> 1000001
print(words_to_number("one crore and one"))
>> 1000,0001
print(words_to_number("0.5 million one"))
>> 500001.0
print(words_to_number("half million and one hundred"))
>> 500100.0
print(words_to_number("quarter"))
>> 0.25
print(words_to_number("one hundred plus one"))
>> 101
This code works for a series data:
import pandas as pd
mylist = pd.Series(['one','two','three'])
mylist1 = []
for x in range(len(mylist)):
mylist1.append(w2n.word_to_num(mylist[x]))
print(mylist1)
I find I faster way:
Da_Unità_a_Cifre = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11,
'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19}
Da_Lettere_a_Decine = {"tw": 20, "th": 30, "fo": 40, "fi": 50, "si": 60, "se": 70, "ei": 80, "ni": 90, }
elemento = input("insert the word:")
Val_Num = 0
try:
elemento.lower()
elemento.strip()
Unità = elemento[elemento.find("ty")+2:] # è uguale alla str: five
if elemento[-1] == "y":
Val_Num = int(Da_Lettere_a_Decine[elemento[0] + elemento[1]])
print(Val_Num)
elif elemento == "onehundred":
Val_Num = 100
print(Val_Num)
else:
Cifre_Unità = int(Da_Unità_a_Cifre[Unità])
Cifre_Decine = int(Da_Lettere_a_Decine[elemento[0] + elemento[1]])
Val_Num = int(Cifre_Decine + Cifre_Unità)
print(Val_Num)
except:
print("invalid input")
This code works only for numbers below 99. Both word to int and int to word (for rest need to implement 10-20 lines of code and simple logic. This is just simple code for beginners):
num = input("Enter the number you want to convert : ")
mydict = {'1': 'One', '2': 'Two', '3': 'Three', '4': 'Four', '5': 'Five','6': 'Six', '7': 'Seven', '8': 'Eight', '9': 'Nine', '10': 'Ten','11': 'Eleven', '12': 'Twelve', '13': 'Thirteen', '14': 'Fourteen', '15': 'Fifteen', '16': 'Sixteen', '17': 'Seventeen', '18': 'Eighteen', '19': 'Nineteen'}
mydict2 = ['', '', 'Twenty', 'Thirty', 'Fourty', 'fifty', 'sixty', 'Seventy', 'Eighty', 'Ninty']
if num.isdigit():
if(int(num) < 20):
print(" :---> " + mydict[num])
else:
var1 = int(num) % 10
var2 = int(num) / 10
print(" :---> " + mydict2[int(var2)] + mydict[str(var1)])
else:
num = num.lower()
dict_w = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': '17', 'eighteen': '18', 'nineteen': '19'}
mydict2 = ['', '', 'twenty', 'thirty', 'fourty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninty']
divide = num[num.find("ty")+2:]
if num:
if(num in dict_w.keys()):
print(" :---> " + str(dict_w[num]))
elif divide == '' :
for i in range(0, len(mydict2)-1):
if mydict2[i] == num:
print(" :---> " + str(i * 10))
else :
str3 = 0
str1 = num[num.find("ty")+2:]
str2 = num[:-len(str1)]
for i in range(0, len(mydict2)):
if mydict2[i] == str2:
str3 = i
if str2 not in mydict2:
print("----->Invalid Input<-----")
else:
try:
print(" :---> " + str((str3*10) + dict_w[str1]))
except:
print("----->Invalid Input<-----")
else:
print("----->Please Enter Input<-----")

Categories