Converting number words into numbers - Python [duplicate] - python

I need to convert one into 1, two into 2 and so on.
Is there a way to do this with a library or a class or anything?

The majority of this code is to set up the numwords dict, which is only done on the first call.
def text2int(textnum, numwords={}):
    """Convert a phrase of English number words into an integer.

    Args:
        textnum: Whitespace-separated lowercase number words, e.g.
            "three hundred thirty seven".
        numwords: Lookup table mapping each word to a ``(scale, increment)``
            pair. The mutable default is deliberate: it is filled on the
            first call and then reused as a cache.

    Returns:
        The integer value of the phrase.

    Raises:
        ValueError: If a word is not a known number word. ``ValueError`` is
            a subclass of ``Exception``, so existing handlers keep working.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen",
        ]
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            # The two leading "" entries only pad the list so that the index
            # equals the tens digit; they must not become number words.
            if word:
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            # idx 0 is "hundred" (10**2); later entries are powers of 1000.
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    current = result = 0
    for word in textnum.split():
        if word not in numwords:
            raise ValueError("Illegal word: " + word)
        scale, increment = numwords[word]
        if scale > 1:
            # A bare scale word ("hundred", "thousand") means one of them.
            current = max(1, current)
        current = current * scale + increment
        if scale > 100:
            # Thousands and above close the current group.
            result += current
            current = 0
    return result + current
print text2int("seven billion one hundred million thirty one thousand three hundred thirty seven")
#7100031337

I have just released a python module to PyPI called word2number for the exact purpose. https://github.com/akshaynagpal/w2n
Install it using:
pip install word2number
make sure your pip is updated to the latest version.
Usage:
from word2number import w2n
print w2n.word_to_num("two million three thousand nine hundred and eighty four")
2003984

I needed something a bit different since my input is from a speech-to-text conversion and the solution is not always to sum the numbers. For example, "my zipcode is one two three four five" should not convert to "my zipcode is 15".
I took Andrew's answer and tweaked it to handle a few other cases people highlighted as errors, and also added support for examples like the zipcode one I mentioned above. Some basic test cases are shown below, but I'm sure there is still room for improvement.
def is_number(x):
    """Return True if *x* can be interpreted as a number by ``float()``.

    Strings may contain thousands separators ("1,000"); the commas are
    removed before the conversion is attempted.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (e.g. None); ValueError: unparsable
        # text; OverflowError: an int too large to fit in a float.
        return False
    return True
def text2int(textnum, numwords={}):
    """Replace number words inside free text with digits.

    Words that are not part of a number are copied through unchanged.
    A unit word directly following another unit word ("one two three")
    is treated as a sequence of separate digits and concatenated, which
    suits speech-to-text input such as zip codes.

    Args:
        textnum: The text to convert. Hyphens are treated as spaces.
        numwords: Lookup table of ``word -> (scale, increment)``. The
            mutable default is deliberate: it is filled on the first call
            and then reused as a cache.

    Returns:
        The converted text. When the text ends with a non-number word the
        result carries a trailing space.
    """
    units = [
        'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
        'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen',
        'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen',
    ]
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty',
            'seventy', 'eighty', 'ninety']
    scales = ['hundred', 'thousand', 'million', 'billion', 'trillion']
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if not numwords:
        numwords['and'] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the '' placeholders that only pad the list
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    def as_int(token):
        """Return the int written in *token* ("1,000" -> 1000), else None."""
        try:
            return int(token.replace(',', ''))
        except ValueError:
            return None

    def is_numword(x):
        # Only integer-looking tokens count as numbers; tokens such as "1.5"
        # or "nan" used to be accepted and then crashed in int().
        return x in numwords or as_int(x) is not None

    def from_numword(x):
        if x in numwords:
            return numwords[x]
        # Literal digits use scale 0, so they replace the value being built.
        return 0, as_int(x)

    textnum = textnum.replace('-', ' ')
    current = result = 0
    curstring = ''
    onnumber = False
    lastunit = False
    lastscale = False

    for token in textnum.split():
        if token in ordinal_words:
            current += ordinal_words[token]
            onnumber = True
            lastunit = False
            lastscale = False
            continue

        # Strip an ordinal suffix only when what remains is a number word,
        # so ordinary words ending in "th" ("month", "with") stay intact.
        word = token
        for ending, replacement in ordinal_endings:
            if token.endswith(ending):
                candidate = token[:-len(ending)] + replacement
                if candidate and is_numword(candidate):
                    word = candidate
                    break

        if (not is_numword(word)) or (word == 'and' and not lastscale):
            if onnumber:
                # Flush the current number we are building
                curstring += repr(result + current) + " "
            curstring += word + " "
            result = current = 0
            onnumber = False
            lastunit = False
            lastscale = False
        else:
            scale, increment = from_numword(word)
            onnumber = True

            if lastunit and (word not in scales):
                # Assume this is part of a string of individual numbers to
                # be flushed, such as a zipcode "one two three four five"
                curstring += repr(result + current)
                result = current = 0

            if scale > 1:
                # A bare scale word ("hundred") means one of them.
                current = max(1, current)

            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0

            lastscale = word in scales
            lastunit = (not lastscale) and word in units

    if onnumber:
        curstring += repr(result + current)

    return curstring
Some tests...
one two three -> 123
three forty five -> 345
three and forty five -> 3 and 45
three hundred and forty five -> 345
three hundred -> 300
twenty five hundred -> 2500
three thousand and six -> 3006
three thousand six -> 3006
nineteenth -> 19
twentieth -> 20
first -> 1
my zip is one two three four five -> my zip is 12345
nineteen ninety six -> 1996
fifty-seventh -> 57
one million -> 1000000
first hundred -> 100
I will buy the first thousand -> I will buy the 1000 # probably should leave ordinal in the string
thousand -> 1000
hundred and six -> 106
1 million -> 1000000

If anyone is interested, I hacked up a version that maintains the rest of the string (though it may have bugs, haven't tested it too much).
def text2int(textnum, numwords={}):
    """Replace number words in a sentence with digits, keeping other words.

    Args:
        textnum: The sentence to convert. Hyphens are treated as spaces.
        numwords: Lookup table of ``word -> (scale, increment)``. The
            mutable default is deliberate: it is filled on the first call
            and then reused as a cache.

    Returns:
        The sentence with each run of number words replaced by its value.
        When the sentence ends with a non-number word the result carries a
        trailing space.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen",
        ]
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the "" placeholders that only pad the list
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')
    current = result = 0
    curstring = ""
    onnumber = False

    for token in textnum.split():
        if token in ordinal_words:
            current += ordinal_words[token]
            onnumber = True
            continue

        # Strip an ordinal suffix only when what remains is a number word,
        # so ordinary words ending in "th" ("with", "month") stay intact.
        word = token
        for ending, replacement in ordinal_endings:
            if token.endswith(ending):
                candidate = token[:-len(ending)] + replacement
                if candidate and candidate in numwords:
                    word = candidate
                    break

        # "and" only joins parts of a number; outside a number it is text
        # (otherwise "cats and dogs" would become "cats 0 dogs").
        if word not in numwords or (word == "and" and not onnumber):
            if onnumber:
                curstring += repr(result + current) + " "
            curstring += word + " "
            result = current = 0
            onnumber = False
        else:
            scale, increment = numwords[word]
            if scale > 1:
                # A bare scale word ("hundred") means one of them.
                current = max(1, current)
            current = current * scale + increment
            if scale > 100:
                result += current
                current = 0
            onnumber = True

    if onnumber:
        curstring += repr(result + current)

    return curstring
Example:
>>> text2int("I want fifty five hot dogs for two hundred dollars.")
I want 55 hot dogs for 200 dollars.
There could be issues if you have, say, "$200". But, this was really rough.

I needed to handle a couple extra parsing cases, such as ordinal words ("first", "second"), hyphenated words ("one-hundred"), and hyphenated ordinal words like ("fifty-seventh"), so I added a couple lines:
def text2int(textnum, numwords={}):
    """Convert English number words, including ordinals, into an integer.

    Handles cardinal words ("one hundred"), ordinal words ("first",
    "twentieth") and hyphenated forms ("fifty-seventh").

    Args:
        textnum: The phrase to convert. Hyphens are treated as spaces.
        numwords: Lookup table of ``word -> (scale, increment)``. The
            mutable default is deliberate: it is filled on the first call
            and then reused as a cache.

    Returns:
        The integer value of the phrase.

    Raises:
        ValueError: If a word is not a known number word. The message
            quotes the word exactly as it appeared in the input.
    """
    if not numwords:
        units = [
            "zero", "one", "two", "three", "four", "five", "six", "seven",
            "eight", "nine", "ten", "eleven", "twelve", "thirteen",
            "fourteen", "fifteen", "sixteen", "seventeen", "eighteen",
            "nineteen",
        ]
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the "" placeholders that only pad the list
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    textnum = textnum.replace('-', ' ')
    current = result = 0

    for token in textnum.split():
        if token in ordinal_words:
            scale, increment = 1, ordinal_words[token]
        else:
            # Strip an ordinal suffix only when what remains is a number
            # word; otherwise keep the token so the error names it as typed.
            word = token
            for ending, replacement in ordinal_endings:
                if token.endswith(ending):
                    candidate = token[:-len(ending)] + replacement
                    if candidate in numwords:
                        word = candidate
                        break
            if word not in numwords:
                raise ValueError("Illegal word: " + token)
            scale, increment = numwords[word]

        if scale > 1:
            # A bare scale word ("hundred") means one of them.
            current = max(1, current)
        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0

    return result + current

Here's the trivial case approach:
>>> number = {'one':1,
... 'two':2,
... 'three':3,}
>>>
>>> number['two']
2
Or are you looking for something that can handle "twelve thousand, one hundred seventy-two"?

def parse_int(string):
    """Convert an English number phrase (up to the millions) to an int.

    Hyphens and commas are treated as separators. Words that are not
    recognised, such as "and", are ignored.

    Args:
        string: The phrase, e.g. "twelve thousand one hundred seventy-two".

    Returns:
        The integer value; 0 when no number word is present.
    """
    small = {
        'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
        'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14,
        'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18,
        'nineteen': 19, 'twenty': 20, 'thirty': 30, 'forty': 40,
        'fifty': 50, 'sixty': 60, 'seventy': 70, 'eighty': 80,
        'ninety': 90,
    }
    large = {'thousand': 1000, 'million': 1000000}

    total = 0  # value of the groups already closed by thousand/million
    group = 0  # value of the group being built (always below one thousand)
    for token in string.replace('-', ' ').replace(',', ' ').split():
        if token in small:
            group += small[token]
        elif token == 'hundred':
            # max(..., 1) lets a bare "hundred" mean one hundred.
            group = max(group, 1) * 100
        elif token in large:
            # Scale only the current group: multiplying everything seen so
            # far would turn "one million two thousand" into a billion.
            total += max(group, 1) * large[token]
            group = 0
    return total + group
Tested with 700 random numbers in range 1 to million works well.

Make use of the Python package: WordToDigits
pip install wordtodigits
It can find numbers present in word form in a sentence and then convert them to the proper numeric format. Also takes care of the decimal part, if present. The word representation of numbers could be anywhere in the passage.

This could easily be hardcoded into a dictionary if there's a limited number of values you'd like to parse.
For slightly more complex cases, you'll probably want to generate this dictionary automatically, based on the relatively simple numbers grammar. Something along the lines of this (of course, generalized...)
for i in range(10):
myDict[30 + i] = "thirty-" + singleDigitsDict[i]
If you need something more extensive, then it looks like you'll need natural language processing tools. This article might be a good starting point.

Made change so that text2int(scale) will return correct conversion. Eg, text2int("hundred") => 100.
import re
# Lazily-built cache of ``word -> (scale, increment)``; filled by the first
# call to text2int().
numwords = {}


def text2int(textnum):
    """Convert English number words, including ordinals, into an integer.

    Words may be separated by any mix of whitespace and hyphens. A bare
    scale word is read as one of that scale: ``text2int("hundred") == 100``.

    Args:
        textnum: The phrase to convert.

    Returns:
        The integer value of the phrase; 0 for an empty phrase.

    Raises:
        ValueError: If a word is not a known number word. The message
            quotes the word exactly as it appeared in the input.
    """
    if not numwords:
        units = ["zero", "one", "two", "three", "four", "five", "six",
                 "seven", "eight", "nine", "ten", "eleven", "twelve",
                 "thirteen", "fourteen", "fifteen", "sixteen", "seventeen",
                 "eighteen", "nineteen"]
        tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty",
                "seventy", "eighty", "ninety"]
        scales = ["hundred", "thousand", "million", "billion", "trillion",
                  "quadrillion", "quintillion", "sextillion", "septillion",
                  "octillion", "nonillion", "decillion"]

        numwords["and"] = (1, 0)
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            # The "" placeholders only pad the list; registering them made
            # every empty token count as ten.
            if word:
                numwords[word] = (1, idx * 10)
        for idx, word in enumerate(scales):
            numwords[word] = (10 ** (idx * 3 or 2), 0)
        # Keep accepting the misspelling the table used to contain.
        numwords["sexillion"] = numwords["sextillion"]

    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    current = result = 0
    # re.split yields '' for leading/trailing separators; drop those.
    tokens = [token for token in re.split(r"[\s-]+", textnum) if token]
    for token in tokens:
        if token in ordinal_words:
            scale, increment = 1, ordinal_words[token]
        else:
            # Strip an ordinal suffix only when what remains is a number
            # word; otherwise keep the token so the error names it as typed.
            word = token
            for ending, replacement in ordinal_endings:
                if token.endswith(ending):
                    candidate = token[:-len(ending)] + replacement
                    if candidate in numwords:
                        word = candidate
                        break
            if word not in numwords:
                raise ValueError("Illegal word: " + token)
            scale, increment = numwords[word]

        if scale > 1:
            current = max(1, current)
        current = current * scale + increment
        if scale > 100:
            result += current
            current = 0

    return result + current

A quick solution is to use the inflect.py to generate a dictionary for translation.
inflect.py has a number_to_words() function that will turn a number (e.g. 2) into its word form (e.g. 'two'). Unfortunately, its reverse (which would allow you to avoid the translation dictionary route) isn't offered. All the same, you can use that function to build the translation dictionary:
>>> import inflect
>>> p = inflect.engine()
>>> word_to_number_mapping = {}
>>>
>>> for i in range(1, 100):
... word_form = p.number_to_words(i) # 1 -> 'one'
... word_to_number_mapping[word_form] = i
...
>>> print word_to_number_mapping['one']
1
>>> print word_to_number_mapping['eleven']
11
>>> print word_to_number_mapping['forty-three']
43
If you're willing to commit some time, it might be possible to examine inflect.py's inner-workings of the number_to_words() function and build your own code to do this dynamically (I haven't tried to do this).

There's a ruby gem by Marc Burns that does it. I recently forked it to add support for years. You can call ruby code from python.
require 'numbers_in_words'
require 'numbers_in_words/duck_punch'
nums = ["fifteen sixteen", "eighty five sixteen", "nineteen ninety six",
"one hundred and seventy nine", "thirteen hundred", "nine thousand two hundred and ninety seven"]
nums.each {|n| p n; p n.in_numbers}
results:
"fifteen sixteen"
1516
"eighty five sixteen"
8516
"nineteen ninety six"
1996
"one hundred and seventy nine"
179
"thirteen hundred"
1300
"nine thousand two hundred and ninety seven"
9297

I took @recursive's logic and converted it to Ruby. I've also hardcoded the lookup table, so it's not as cool, but it might help a newbie understand what is going on.
# Lookup table: number word => [scale, increment].
# Plain number words have scale 1 and add their value; scale words
# ("hundred", "thousand", "million") multiply the value built so far.
WORDNUMS = {
  "zero" => [1, 0], "one" => [1, 1], "two" => [1, 2], "three" => [1, 3],
  "four" => [1, 4], "five" => [1, 5], "six" => [1, 6], "seven" => [1, 7],
  "eight" => [1, 8], "nine" => [1, 9], "ten" => [1, 10],
  "eleven" => [1, 11], "twelve" => [1, 12], "thirteen" => [1, 13],
  "fourteen" => [1, 14], "fifteen" => [1, 15], "sixteen" => [1, 16],
  "seventeen" => [1, 17], "eighteen" => [1, 18], "nineteen" => [1, 19],
  "twenty" => [1, 20], "thirty" => [1, 30], "forty" => [1, 40],
  "fifty" => [1, 50], "sixty" => [1, 60], "seventy" => [1, 70],
  "eighty" => [1, 80], "ninety" => [1, 90],
  "hundred" => [100, 0], "thousand" => [1000, 0],
  "million" => [1000000, 0]
}

# Converts an English number phrase such as
# "two thousand one hundred and forty-six" into an Integer.
# Hyphens are treated as spaces and the word "and" is ignored.
# Raises ArgumentError when a word is not in WORDNUMS.
def text_2_int(string)
  # String#split without arguments splits on any whitespace run and never
  # yields empty tokens, unlike split(/ /).
  number_words = string.tr('-', ' ').split - %w[and]
  current = 0
  result = 0
  number_words.each do |word|
    scale, increment = WORDNUMS.fetch(word) do
      raise ArgumentError, "Illegal word: #{word}"
    end
    # A bare scale word ("hundred") means one of them.
    current = 1 if scale > 1 && current.zero?
    current = current * scale + increment
    if scale > 100
      # Thousands and above close the current group.
      result += current
      current = 0
    end
  end
  result + current
end
I was looking to handle strings like two thousand one hundred and forty-six

This handles number in words of Indian style, some fractions, combination of numbers and words and also addition.
def words_to_number(words):
    """Convert a number phrase (Western or Indian style) into a number.

    Supports the fractions "half" and "quarter", literal numbers mixed
    with words ("0.5 million"), Indian scale words ("lakh", "crore"), and
    addition through "and" / "plus": the text after the first such word is
    parsed as an independent number and added to the part before it.

    Args:
        words: Whitespace-separated lowercase words and/or literal numbers.

    Returns:
        An int when only whole-number words are involved, otherwise a float.

    Raises:
        ValueError: If a token is neither a known word nor a literal number.
    """
    numbers = {"zero": 0, "a": 1, "half": 0.5, "quarter": 0.25, "one": 1,
               "two": 2, "three": 3, "four": 4, "five": 5, "six": 6,
               "seven": 7, "eight": 8, "nine": 9, "ten": 10, "eleven": 11,
               "twelve": 12, "thirteen": 13, "fourteen": 14, "fifteen": 15,
               "sixteen": 16, "seventeen": 17, "eighteen": 18,
               "nineteen": 19, "twenty": 20, "thirty": 30, "forty": 40,
               "fifty": 50, "sixty": 60, "seventy": 70, "eighty": 80,
               "ninety": 90}
    groups = {"hundred": 100, "thousand": 1_000,
              "lac": 1_00_000, "lakh": 1_00_000,
              "million": 1_000_000, "crore": 10**7,
              "billion": 10**9, "trillion": 10**12}
    split_at = ("and", "plus")

    tokens = words.split()
    tail = 0        # value of the text after the first "and"/"plus"
    segments = []   # [value, scale] pairs closed by a scale word
    pending = None  # value being built that no scale word has closed yet

    for index, word in enumerate(tokens):
        if word in split_at:
            tail = words_to_number(" ".join(tokens[index + 1:]))
            break
        if word in groups:
            scale = groups[word]
            if scale == 100:
                # "hundred" stays inside the current group ("two hundred
                # thousand"); a bare "hundred" means one hundred.
                pending = (1 if pending is None else pending) * scale
            elif pending is not None:
                # Scale only the current group, not everything before it,
                # so "one thousand two hundred" is 1200.
                segments.append([pending * scale, scale])
                pending = None
            elif segments and segments[-1][1] < scale:
                # Compound scale such as "ten thousand crore".
                segments[-1][0] *= scale
                segments[-1][1] = scale
            else:
                # Bare scale word such as "million".
                segments.append([scale, scale])
        elif word in numbers:
            pending = (0 if pending is None else pending) + numbers[word]
        else:
            try:
                value = float(word)
            except ValueError as e:
                raise ValueError(f"Invalid word {word}") from e
            pending = (0 if pending is None else pending) + value

    return tail + sum(value for value, _ in segments) + (pending or 0)
TEST:
print(words_to_number("a million and one"))
>> 1000001
print(words_to_number("one crore and one"))
>> 10000001
print(words_to_number("0.5 million one"))
>> 500001.0
print(words_to_number("half million and one hundred"))
>> 500100.0
print(words_to_number("quarter"))
>> 0.25
print(words_to_number("one hundred plus one"))
>> 101

This code works for a series data:
import pandas as pd
mylist = pd.Series(['one','two','three'])
mylist1 = []
for x in range(len(mylist)):
mylist1.append(w2n.word_to_num(mylist[x]))
print(mylist1)

I found a faster way:
# Interactive script: reads one English number word (no spaces, e.g.
# "five", "forty", "fortyfive", "onehundred") and prints its value.

# Unit and teen words -> value.
Da_Unità_a_Cifre = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11,
                    'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19}
# First two letters of a tens word ("twenty", "thirty", ...) -> value.
Da_Lettere_a_Decine = {"tw": 20, "th": 30, "fo": 40, "fi": 50, "si": 60, "se": 70, "ei": 80, "ni": 90, }
elemento = input("insert the word:")
Val_Num = 0
try:
    # str methods return a new string, so the result has to be re-assigned;
    # calling them without assignment left the input untouched.
    elemento = elemento.lower().strip()
    if elemento in Da_Unità_a_Cifre:
        # Plain unit/teen word such as "five" or "twelve". These used to be
        # rejected because the code always looked for a "ty" prefix.
        Val_Num = Da_Unità_a_Cifre[elemento]
    elif elemento == "onehundred":
        Val_Num = 100
    elif elemento[-1] == "y":
        # Round tens: "twenty", "thirty", ...
        Val_Num = Da_Lettere_a_Decine[elemento[:2]]
    else:
        # Compound such as "fortyfive": the text after "ty" is the unit
        # word (here "five").
        Unità = elemento[elemento.find("ty") + 2:]
        Cifre_Unità = Da_Unità_a_Cifre[Unità]
        Cifre_Decine = Da_Lettere_a_Decine[elemento[:2]]
        Val_Num = Cifre_Decine + Cifre_Unità
    print(Val_Num)
except (KeyError, IndexError):
    # KeyError: unknown word or prefix; IndexError: empty input.
    print("invalid input")

This code works only for numbers below 99. Both word to int and int to word (for rest need to implement 10-20 lines of code and simple logic. This is just simple code for beginners):
# Interactive script: converts a number from 1 to 99 into words, or a
# number word (e.g. "five", "twenty", "twentyfive") into digits.
num = input("Enter the number you want to convert : ").strip()
# Digits -> word for 1..19.
mydict = {'1': 'One', '2': 'Two', '3': 'Three', '4': 'Four', '5': 'Five', '6': 'Six', '7': 'Seven', '8': 'Eight', '9': 'Nine', '10': 'Ten', '11': 'Eleven', '12': 'Twelve', '13': 'Thirteen', '14': 'Fourteen', '15': 'Fifteen', '16': 'Sixteen', '17': 'Seventeen', '18': 'Eighteen', '19': 'Nineteen'}
# Tens words indexed by the tens digit (positions 0 and 1 are unused).
mydict2 = ['', '', 'Twenty', 'Thirty', 'Forty', 'fifty', 'sixty', 'Seventy', 'Eighty', 'Ninety']
if num.isdigit():
    value = int(num)
    if value < 1 or value > 99:
        # The tables only cover 1..99; other values used to raise
        # KeyError or IndexError.
        print("----->Invalid Input<-----")
    elif value < 20:
        print(" :---> " + mydict[str(value)])
    else:
        var2, var1 = divmod(value, 10)
        # Round tens such as 20 have no unit word ('0' is not in mydict).
        units_word = mydict[str(var1)] if var1 else ''
        print(" :---> " + mydict2[var2] + units_word)
else:
    num = num.lower()
    dict_w = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, 'sixteen': 16, 'seventeen': 17, 'eighteen': 18, 'nineteen': 19}
    mydict2 = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    if not num:
        print("----->Please Enter Input<-----")
    elif num in dict_w:
        print(" :---> " + str(dict_w[num]))
    elif num in mydict2:
        # Round tens; the list index is the tens digit.
        print(" :---> " + str(mydict2.index(num) * 10))
    else:
        # Compound such as "twentyfive": split right after the "ty".
        cut = num.find("ty") + 2
        str2 = num[:cut]
        str1 = num[cut:]
        # Only the units 1..9 may follow a tens word.
        if cut < 2 or str2 not in mydict2 or dict_w.get(str1, 10) > 9:
            print("----->Invalid Input<-----")
        else:
            print(" :---> " + str(mydict2.index(str2) * 10 + dict_w[str1]))

Related

How can I use my helper functions to get the correct output

So I created a helper function to help my main function in extracting stuff from a dictionary...
and here is my code and function
def rdict(recipes):
    """Parse recipe strings into a nested dictionary.

    Each entry looks like ``"Name:Product*N,Product*N"``. The result maps
    the recipe name to a dict of ``product -> quantity`` (int).
    """
    def parse_parts(parts):
        # "Cabbage*5,Carrot*1" -> {"Cabbage": 5, "Carrot": 1}
        ingredients = {}
        for chunk in parts.split(','):
            product, number = chunk.split('*')
            ingredients[product] = int(number)
        return ingredients

    parsed = {}
    for line in recipes:
        recipe_name, parts = line.split(":")
        parsed[recipe_name] = parse_parts(parts)
    return parsed
def extract(recipes, data):
    """Describe the requested recipes as ``"product:qty, product:qty"``.

    Args:
        recipes: Names of the recipes to look up.
        data: Mapping of recipe name to a mapping of product -> quantity.

    Returns:
        A list with one formatted string per requested recipe, in the
        order given by *recipes*.
    """
    return [
        ", ".join(f"{key}:{data[name][key]}" for key in data[name])
        for name in recipes
    ]
So what I'm trying to do is make sure data in extract(recipe, data) goes through rdict(data), since rdict will convert data into a dictionary, which is what I need. However, when I tried doing for key in rdict(data[r]): the output is an error: string is not subscriptable.
what should I do to successfully implement the changes??
Edit
So from my current code, here is a sample input..
print(extract(recipes = ['T-Bone', 'Green Salad1'],data = ["Pork Stew:Cabbage*5,Carrot*1,Fatty Pork*10",
"Green Salad1:Cabbage*10,Carrot*2,Pineapple*5",
"T-Bone:Carrot*2,Steak Meat*1"]
))
and in order for my code to work, it has to be like this
print(extract(recipes = ['T-Bone', 'Green Salad1'], data = {'Pork Stew': {'Cabbage': 5, 'Carrot': 1, 'Fatty Pork': 10}, 'Green Salad1': {'Cabbage': 10, 'Carrot': 2, 'Pineapple': 5},'T-Bone': {'Carrot': 2, 'Steak Meat': 1}}))
So from the input, data should be changed from
data = ["Pork Stew:Cabbage*5,Carrot*1,Fatty Pork*10",
"Green Salad1:Cabbage*10,Carrot*2,Pineapple*5",
"T-Bone:Carrot*2,Steak Meat*1"]
to
data = {'Pork Stew': {'Cabbage': 5, 'Carrot': 1, 'Fatty Pork': 10}, 'Green Salad1': {'Cabbage': 10, 'Carrot': 2, 'Pineapple': 5},'T-Bone': {'Carrot': 2, 'Steak Meat': 1}}
Convert the data to dict in extract().
recipes = ['T-Bone', 'Green Salad1']
data = ["Pork Stew:Cabbage*5,Carrot*1,Fatty Pork*10",
"Green Salad1:Cabbage*10,Carrot*2,Pineapple*5",
"T-Bone:Carrot*2,Steak Meat*1"]
def rdict(recipes):
    """Turn ``"Name:Product*N,Product*N"`` strings into nested dicts.

    Returns a dict mapping each recipe name to ``{product: quantity}``
    with the quantities converted to int.
    """
    parsed = {}
    for entry in recipes:
        recipe_name, parts = entry.split(":")
        pairs = (chunk.split('*') for chunk in parts.split(','))
        parsed[recipe_name] = {
            product: int(number) for product, number in pairs
        }
    return parsed
def extract(recipes, data):
    """Format the requested recipes from raw recipe strings.

    Args:
        recipes: Names of the recipes to report.
        data: Recipe strings in the ``"Name:Product*N,..."`` form accepted
            by rdict().

    Returns:
        One ``"product:qty, product:qty"`` string per requested recipe.
    """
    parsed = rdict(data)  # convert data to dict first
    result = []
    for name in recipes:
        ingredients = parsed[name]
        pairs = [f"{product}:{amount}" for product, amount in ingredients.items()]
        result.append(", ".join(pairs))
    return result
print(extract(recipes, data))
Output:
['Carrot:2, Steak Meat:1', 'Cabbage:10, Carrot:2, Pineapple:5']
Renamed rdict to parse_recipe, and modified it to return a tuple that is lighter and easier to process
In extract:
a) Build a dict of recipes: data_recipes
b) Build the result by getting the wanted recipes, with a guard against a missing recipe (which will be an empty dict: {})
def parse_recipe(s):
    """Split ``"Name:Product*N,Product*N"`` into ``(name, ingredients)``.

    Surrounding whitespace is stripped from the name, the products and
    the quantities. Quantities are kept as strings.
    """
    recipe, _, ingredient_text = _split_once(s)
    ingredients = {}
    for item in ingredient_text.split(','):
        product, quantity = item.split('*')
        ingredients[product.strip()] = quantity.strip()
    return recipe.strip(), ingredients


def _split_once(s):
    """Split *s* on ':' into exactly two parts, returned as (head, ':', tail)."""
    head, tail = s.split(':')
    return head, ':', tail
def extract(recipes, data):
    """Return ``{recipe: ingredients}`` for the requested recipes.

    Args:
        recipes: Names of the recipes wanted.
        data: Recipe strings accepted by parse_recipe().

    Returns:
        A dict keyed by the requested names; a recipe missing from *data*
        maps to an empty dict.
    """
    # dict() consumes the (name, ingredients) tuples from parse_recipe.
    known = dict(parse_recipe(line) for line in data)
    wanted = {}
    for name in recipes:
        wanted[name] = known.get(name, {})
    return wanted

KeyError for key that exists in dictionary [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 3 years ago.
Improve this question
I have a piece of code that has an array that loops through an array of keys for a dictionary. It gets each of these keys, applies it to the dictionary and then gets the returned value and uses it for spacing out my unicode table. Example:
sizeDict = {
"Name": 16,
"Mobile": 11,
"Pizza": 7,
"Drink": 7,
"Dessert": 7,
"Total": 7
}
header = ['name', 'mobile', 'pizza', 'drink', 'dessert', 'total']
def printRow(firstChar, endChar, space, specialChar, spaceArray, spaceDict):
    """Build one horizontal border line of the table.

    Args:
        firstChar: Character that starts the line.
        endChar: Character that ends the line.
        space: Fill character, repeated to each column's width.
        specialChar: Junction character placed between columns.
        spaceArray: Column labels, in display order.
        spaceDict: Mapping of column label to column width.

    Returns:
        The border line. With no columns, just *firstChar*.

    Raises:
        KeyError: If a label is in *spaceDict* neither as written nor in
            title case.
    """
    def width_of(label):
        # Prefer the label exactly as given: str.title() rewrites keys such
        # as "% of Total" into "% Of Total" and then misses them. The
        # title-cased lookup stays as a fallback for lowercase headers.
        if label in spaceDict:
            return spaceDict[label]
        return spaceDict[label.title()]

    if not spaceArray:
        return firstChar
    cells = [space * width_of(label) for label in spaceArray]
    return firstChar + specialChar.join(cells) + endChar
print(printRow("┏", "┓", "━", "┳", header, sizeDict))
#Returns ┏━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┳━━━━━━━┓
The problem is that, for some reason, it doesn't recognise some of the values from the array as actual keys in the dictionary, even though both strings are exactly alike. Here is the specific array, dictionary and error message where the error occurred.
statHeader = ['Average Total', 'Earned', '% of Total']
statSizeDict = {
"Average Total": 15,
"Earned": 10,
"% of Total": 20
}
statSizeArray = [15,10,20]
"""
<ipython-input-3-5c5f25401e4d> in statOrders(statData)
272 for i in range(0, len(statHeader)):
273 tempString += spaceVal(statHeader[i], statSizeDict[statHeader[i]])
--> 274 print(printRow("┏", "┓", "━", "┳", statHeader, statSizeDict))
275 print(tempString)
276 print(printRow("┣", "┫", "━", "╋", statHeader, statSizeDict))
<ipython-input-3-5c5f25401e4d> in printRow(firstChar, endChar, space, specialChar, spaceArray, spaceDict)
51 if(i == len(spaceArray) - 1):
52 specialChar = endChar
---> 53 output = output + space * spaceDict[spaceArray[i].title()] + specialChar
54 return output
55 # ========================================================================================================================== #
KeyError: '% Of Total'
"""
Here is the complete code for reference:
import csv
import os
# ========================================================================================================================== #
# Module-level list; not referenced by the functions visible in this file.
# NOTE(review): presumably the in-memory list of orders -- confirm.
data = []
# NOTE(review): purpose not visible in this file; looks like test fixture data.
testData = [8001, 499382, 'int', 'int', 'float']
# Column names of an order row: written as the CSV header by saveOrder() and
# used (title-cased) as table column labels by printOrders().
header = ['name', 'mobile', 'pizza', 'drink', 'dessert', 'total']
# Column labels of the statistics table printed by statOrders().
statHeader = ['Average Total', 'Earned', '% of Total']
# ========================================================================================================================== #
# Menu: item name -> [category, price]. newOrderItems() and newBulkOrders()
# use the category to pick the pizza/drink/dessert subtotal.
orderDict = {
"Small cheese pizza": ['pizza', 5],
"Big cheese pizza": ['pizza', 10],
"Small drink": ['drink', 1],
"Large drink": ['drink', 1.5],
"Small dessert": ['dessert', 0.5],
"Large dessert": ['dessert', 1]
}
# Value type accepted by modifyOrder() -> index of that column in an order row.
posDict = {
"Pizza Total": 2,
"Drink Total": 3,
"Dessert Total": 4
}
# Human-readable message for each numeric task code returned by the functions.
returnDict = {
1: 'Task completed succesfully.',
2: 'Task encountered an unexpected error.',
3: 'No file data was found.',
4: 'Value does not exist or is out of range.',
5: 'Value type is invalid or does not exist in current range.',
6: 'Value length was to large for function.'
}
# Width of each order-table column, keyed by the title-cased header name.
sizeDict = {
"Name": 16,
"Mobile": 11,
"Pizza": 7,
"Drink": 7,
"Dessert": 7,
"Total": 7
}
# Width of each statistics-table column, keyed by the statHeader label.
statSizeDict = {
"Average Total": 15,
"Earned": 10,
"% of Total": 20
}
# The same statistics column widths by position; statOrders() indexes this.
statSizeArray = [15,10,20]
def spaceVal(value, size):
    """Format *value* as one table cell of exactly *size* characters.

    The text of *value* is cut to *size* characters, padded on the right
    with spaces when shorter, and followed by the column separator '┃'.
    """
    text = str(value)
    cell = text[:size].ljust(size)
    return cell + '┃'
def printRow(firstChar, endChar, space, specialChar, spaceArray, spaceDict):
    """Build one horizontal border line of a table.

    Args:
        firstChar: Character that starts the line.
        endChar: Character that ends the line.
        space: Fill character, repeated to each column's width.
        specialChar: Junction character placed between columns.
        spaceArray: Column labels, in display order.
        spaceDict: Mapping of column label to column width.

    Returns:
        The border line. With no columns, just *firstChar*.

    Raises:
        KeyError: If a label is in *spaceDict* neither as written nor in
            title case.
    """
    def column_width(label):
        # Look the label up as written first: str.title() turns
        # "% of Total" into "% Of Total", which is not a key. Title-casing
        # remains as a fallback so lowercase header names keep working.
        try:
            return spaceDict[label]
        except KeyError:
            return spaceDict[label.title()]

    if not spaceArray:
        return firstChar
    return firstChar + specialChar.join(
        space * column_width(label) for label in spaceArray
    ) + endChar
# ========================================================================================================================== #
def newOrderCash(appendData, name, mobile, pizza=0, drink=0, dessert=0):
    """
    Record an order given as per-category cash amounts.

    Args:
        appendData - list that receives the new order row
                     [name, mobile, pizza, drink, dessert, total].
        name       - the customer's name.
        mobile     - the customer's mobile number (string or integer);
                     stored as a string.
        pizza      - total cost of the ordered pizzas, default 0.
        drink      - total cost of the ordered drinks, default 0.
        dessert    - total cost of the ordered desserts, default 0.

    Returns:
        [name, total] when the order was recorded.
        None - if name or mobile is missing or empty.
        6    - if name is longer than 21 characters or mobile is longer
               than 10 characters; nothing is recorded.
    """
    if name is None or name == "":
        return None
    if mobile is None or mobile == "":
        return None
    if len(name) > 21 or len(str(mobile)) > 10:
        return 6

    costs = [float(pizza), float(drink), float(dessert)]
    total = costs[0] + costs[1] + costs[2]
    appendData.append([name, str(mobile)] + costs + [total])
    return [name, total]
def newOrderItems(appendData, name, mobile, items):
    """
    Record an order given as a list of menu item names.

    Each item is looked up in orderDict to find its category and price.

    Args:
        appendData - list that receives the new order row
                     [name, mobile, pizza, drink, dessert, total].
        name       - the customer's name.
        mobile     - the customer's mobile number (string or integer);
                     stored as a string.
        items      - list of menu item names (keys of orderDict).

    Returns:
        [name, total] when the order was recorded.
        None - if name or mobile is missing or empty.
    """
    if name is None or name == "":
        return None
    if mobile is None or mobile == "":
        return None

    # Subtotals start as int 0 and turn into floats once a price is added.
    subtotals = {'pizza': 0, 'drink': 0, 'dessert': 0}
    total = 0
    for item in items:
        price = float(orderDict[item][1])
        category = orderDict[item][0]
        total += price
        if category in subtotals:
            subtotals[category] += price

    appendData.append([name, str(mobile), subtotals['pizza'],
                       subtotals['drink'], subtotals['dessert'], total])
    return [name, total]
def newBulkOrders(appendData, names, mobiles, items):
    """
    Record several orders at once, each given as a list of menu items.

    Rows are staged first and only added to appendData once every order
    has been processed, so a failure part-way leaves appendData untouched.

    Args:
        appendData - list that receives one row per order:
                     [name, mobile, pizza, drink, dessert, total].
        names      - list of the customers' names.
        mobiles    - list of the customers' mobile numbers.
        items      - list of lists; items[n] holds the menu item names
                     ordered by names[n].

    Returns:
        [names, totals] - two parallel lists, when all orders were recorded.
        None - if names or mobiles is None or an empty string.
        Task Code 4 - if an IndexError occurs, for example when mobiles or
                      items is shorter than names.
    """
    if names is None or names == "":
        return None
    if mobiles is None or mobiles == "":
        return None

    try:
        stagedRows = []
        orderNames = []
        orderTotals = []
        for idx in range(len(names)):
            # Subtotals start as int 0 and turn into floats once a price
            # is added.
            subtotals = {'pizza': 0, 'drink': 0, 'dessert': 0}
            total = 0
            for item in items[idx]:
                price = float(orderDict[item][1])
                category = orderDict[item][0]
                total += price
                if category in subtotals:
                    subtotals[category] += price
            row = [names[idx], str(mobiles[idx]), float(subtotals['pizza']),
                   subtotals['drink'], subtotals['dessert'], total]
            orderTotals.append(total)
            orderNames.append(names[idx])
            stagedRows.append(row)
        for row in stagedRows:
            appendData.append(row)
        return [orderNames, orderTotals]
    except IndexError:
        return 4
# ========================================================================================================================== #
def saveOrder(saveData, filename):
    """
    Write the orders to a CSV file, replacing any existing content.

    The first row written is the module-level header list; each entry of
    saveData follows as one row.

    Args:
        saveData - the list of order rows to write.
        filename - name of the file to write to.
    Returns:
        Task Code 1 - If the task is completed successfully.
    """
    # The with-block closes the file, which also flushes buffered rows to
    # disk; the handle was previously left open.
    with open(filename, 'w', newline='') as csvFile:
        writeFile = csv.writer(csvFile)
        writeFile.writerow(header)
        writeFile.writerows(saveData)
    return 1
def getOrders(writeData, filename):
    """
    Load orders from a CSV file and append them to writeData.

    The first row of the file is treated as the header and skipped.
    Columns 2-5 of every order row (pizza, drink, dessert, total) are
    converted to float. If the file does not exist, an empty file with
    that name is created and writeData is left unchanged.

    Args:
        writeData - the list that receives the loaded order rows.
        filename  - name of the CSV file to read.
    Returns:
        Task Code 1 - If the task is completed successfully.
    """
    # Check the file actually requested: testing a hard-coded name could
    # send an existing file down the "create" branch and truncate it.
    if os.path.isfile(filename):
        with open(filename, 'r', newline='') as csvFile:
            rows = list(csv.reader(csvFile))
        # Skip the header row so the float conversions below don't fail.
        # Rows are converted before they are added, so rows already in
        # writeData are never dropped or re-converted.
        loaded = []
        for row in rows[1:]:
            for column in range(2, 6):
                row[column] = float(row[column])
            loaded.append(row)
        writeData.extend(loaded)
        print(writeData)
    else:
        # Create the (empty) file so that it exists from now on.
        with open(filename, 'w', newline=''):
            pass
    return 1
# ========================================================================================================================== #
def printOrders(printData):
    """
    Print the orders as a box-drawn table.

    Column labels come from the module-level header list (title-cased)
    and column widths from sizeDict; values wider than their column are
    cut by spaceVal so that long names cannot break the layout.

    Args:
        printData - list of order rows to print.
    Returns:
        Task Code 1 - always.
    """
    titles = [label.title() for label in header]
    titleLine = "┃" + "".join(spaceVal(title, sizeDict[title]) for title in titles)

    print(printRow("┏", "┓", "━", "┳", header, sizeDict))
    print(titleLine)
    print(printRow("┣", "┫", "━", "╋", header, sizeDict))

    for rowIndex in range(len(printData)):
        order = printData[rowIndex]
        line = "┃"
        for column in range(len(order)):
            line += spaceVal(order[column], sizeDict[header[column].title()])
        print(line)

    print(printRow("┗", "┛", "━", "┻", header, sizeDict))
    print('\n')
    return 1
# ========================================================================================================================== #
def statOrders(statData):
    """
    Prints a table with the totals of pizzas, drinks, desserts and
    the overall total, each with its share of the overall total.
    Args:
    statData - the list of orders; columns 2 to 5 of every order
    are added to the pizza, drink, dessert and overall totals.
    Returns:
    Task Code 1 - If the task is complete successfully.
    """
    statArray = [[0, 'Pizzas'], [0, 'Drinks'], [0, 'Desserts'], [0, 'Overall']]
    for order in statData:
        for category in range(4):
            statArray[category][0] += order[category + 2]
    overall = statArray[3][0]
    headerLine = "┃" + "".join(spaceVal(title, statSizeDict[title]) for title in statHeader)
    print(printRow("┏", "┓", "━", "┳", statHeader, statSizeDict))
    print(headerLine)
    print(printRow("┣", "┫", "━", "╋", statHeader, statSizeDict))
    for total, label in statArray:
        # With no orders (or only zero-valued ones) the overall total is 0;
        # show 0% instead of dividing by zero.
        percent = int(100 * total / overall) if overall else 0
        print("┃" + spaceVal(label, statSizeArray[0])
              + spaceVal(str(total), statSizeArray[1])
              + spaceVal(str(percent) + '%', statSizeArray[2]))
    print(printRow("┗", "┛", "━", "┻", statHeader, statSizeDict))
    print('\n')
    return 1
# ========================================================================================================================== #
def modifyOrder(modifyData, orderName, modifyValueType, newValue):
    """
    Changes one value of the first order named orderName and
    recalculates that order's total.
    Args:
    modifyData - the list of orders to search through.
    orderName - the name (column 0) of the order to change.
    modifyValueType - key into posDict which selects the column to change.
    newValue - the new value; it is converted to float.
    Returns:
    The order's new total (column 5) if the order was found.
    Task Code 4 - If no order has the given name.
    Task Code 2 - If a TypeError or IndexError occurs.
    """
    try:
        for order in modifyData:
            if order[0] != orderName:
                continue
            order[posDict[modifyValueType]] = float(newValue)
            # The total is the sum of columns 2, 3 and 4.
            order[5] = float(sum(order[2:5]))
            return order[5]
    except (TypeError, IndexError):
        return 2
    return 4
# ========================================================================================================================== #
# End of Asserts: Task Failed Succesfully
# ========================================================================================================================== #
And the specific function where the error occurs:
def statOrders(statData):
    """
    Prints a table with the totals of pizzas, drinks, desserts and
    the overall total, each with its share of the overall total.
    Args:
    statData - the list of orders; columns 2 to 5 of every order
    are added to the pizza, drink, dessert and overall totals.
    Returns:
    Task Code 1 - If the task is complete successfully.
    """
    statArray = [[0, 'Pizzas'], [0, 'Drinks'], [0, 'Desserts'], [0, 'Overall']]
    for order in statData:
        for category in range(4):
            statArray[category][0] += order[category + 2]
    overall = statArray[3][0]
    headerLine = "┃" + "".join(spaceVal(title, statSizeDict[title]) for title in statHeader)
    print(printRow("┏", "┓", "━", "┳", statHeader, statSizeDict))
    print(headerLine)
    print(printRow("┣", "┫", "━", "╋", statHeader, statSizeDict))
    for total, label in statArray:
        # With no orders (or only zero-valued ones) the overall total is 0;
        # show 0% instead of dividing by zero.
        percent = int(100 * total / overall) if overall else 0
        print("┃" + spaceVal(label, statSizeArray[0])
              + spaceVal(str(total), statSizeArray[1])
              + spaceVal(str(percent) + '%', statSizeArray[2]))
    print(printRow("┗", "┛", "━", "┻", statHeader, statSizeDict))
    print('\n')
    return 1
# ========================================================================================================================== #
In fact
"% of Total".title() returns % Of Total.
This explains your error

Passing string over urls django

I'm having trouble understanding this error in my code. First, let me explain what is happening and what I'm trying to do.
My code is designed to load up 45 separate text files into an array, including the weight of each word/phrase and the word phrase itself. This has to occur at the beginning, before any description is received.
Second, once the description is received, it is parsed by my software into words/phrases, which are compared to the words/phrases in the array.
Third, my software then provides the top three classes, in rank order (first/second/third) by number, along with the score for each class.
I've made a django application that will serve this code, so I have a form which will provide two parameters classes and description, like this:
class TrademarkClassifierForm(forms.Form):
    """
    Form that collects the two inputs of the trademark classifier:
    the classes to test against and the description to classify.
    """
    classes = forms.CharField(
        label="Test all trademark classes? Type 'yes' to do so or else enter the class to be tested ",
        max_length=10,
    )
    description = forms.CharField(widget=forms.Textarea)

    def __init__(self, *args, **kwargs):
        super(TrademarkClassifierForm, self).__init__(*args, **kwargs)
        # Attach a form helper that carries the submit button.
        helper = FormHelper()
        helper.add_input(Submit('submit', 'Submit'))
        self.helper = helper
Then I want to pass this two parameters in the view over the url like this:
class TrademarkClassifierResultView(FormView):
    """
    Shows the classifier form and, when the query string carries
    ``classes`` and ``description``, the classification result.

    A valid POST redirects back to this page with both values in the
    query string (``?classes=...&description=...``); the following GET
    renders the result.
    """
    template_name = 'trademark.html'
    form_class = TrademarkClassifierForm

    def get(self, request, *args, **kwargs):
        # Keep missing parameters as None instead of the string 'None', and
        # avoid str(), which fails on non-ASCII text under Python 2.
        classes = request.GET.get('classes')
        description = request.GET.get('description')
        form = TrademarkClassifierForm(initial={'classes': classes, 'description': description})
        context_data = self.get_context_data(classes, description, form=form)
        return self.render_to_response(context_data)

    def form_valid(self, form):
        classes = form.cleaned_data['classes']
        description = form.cleaned_data['description']
        return redirect(self.get_success_url(classes, description))

    def form_invalid(self, form):
        messages.add_message(self.request, messages.ERROR,
                             "Invalid data. Please check fields.")
        return self.render_to_response(
            self.get_context_data(form=form)
        )

    def get_success_url(self, classes=None, description=None):
        """Return this page's URL with the given values as an encoded query string."""
        from django.utils.http import urlencode

        url = reverse("classifier:trademark")
        # A list of pairs keeps the parameter order stable; urlencode escapes
        # spaces, '&', '=' and non-ASCII characters in the free-text description.
        params = [(name, value)
                  for name, value in (('classes', classes), ('description', description))
                  if value is not None]
        if params:
            url += "?" + urlencode(params)
        return url

    def get_context_data(self, classes=None, description=None, **kwargs):
        """
        Build the template context.

        ``classes`` and ``description`` are optional so that callers which
        only pass keyword arguments (such as ``form_invalid``) keep working.
        The classifier is only run when both values are present; otherwise
        ``trademark`` is None.
        """
        context = super(TrademarkClassifierResultView, self).get_context_data(**kwargs)
        context['classes'] = classes
        context['description'] = description
        if classes and description:
            context['trademark'] = ClassifyMarkBased.control_program(classes, description)
        else:
            context['trademark'] = None
        return context
Now my problem is this error:
Environment:
Request Method: GET
Request URL: http://127.0.0.1:8000/trademark/
Django Version: 1.11.2
Python Version: 2.7.12
Installed Applications:
['django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.sites',
'classifier',
'crispy_forms',
'allauth',
'allauth.account',
'allauth.socialaccount',
'widget_tweaks',
'debug_toolbar']
Installed Middleware:
['django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
'debug_toolbar.middleware.DebugToolbarMiddleware']
Traceback:
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/exception.py" in inner
41. response = get_response(request)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/base.py" in _get_response
187. response = self.process_exception_by_middleware(e, request)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/core/handlers/base.py" in _get_response
185. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/views/generic/base.py" in view
68. return self.dispatch(request, *args, **kwargs)
File "/home/petar/.virtualenvs/trademark/local/lib/python2.7/site-packages/django/views/generic/base.py" in dispatch
88. return handler(request, *args, **kwargs)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/views.py" in get
60. context_data = self.get_context_data(classes, description, form=form)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/views.py" in get_context_data
82. context['trademark'] = ClassifyMarkBased.control_program(classes, description)
File "/home/petar/Documents/Synergy/Trademark/TM_base/classifier/services/classify_mark_based.py" in control_program
89. N = len(word_count_array_for_all_classes[i])
Exception Type: IndexError at /trademark/
Exception Value: list index out of range
This is my url:
url(r'^trademark/', TrademarkClassifierResultView.as_view(), name="trademark"),
and this is the part of the code that should calculate the trademark over this two parameters:
import os
import numpy as np
import re
import requests
class TrademarkService(object):
    """
    Scores a description of goods/services against per-class trademark
    word lists and ranks the best matching classes.

    Every method is static and reaches its sibling helpers through the
    class itself, e.g. ``TrademarkService.control_the_program('yes', text)``.
    """

    # Words and tokens that were found to interfere with classification.
    _NOT_USEFUL_WORDS = frozenset((
        "about", "are", "upon", "-", " ", "up", "other", "or", "not", "namely",
        "more", "made", "in", "for", "except", "but", "being", "all", "against",
        "was", "were", "will", "that", "its", "on", "it", "at", "our", "your",
        "ours", "yours", "their", "them", "out", "having", "have", "has", "be",
        "than", "use", "uses", "using", "", "by", "and", "an", "a", "used", "to",
        "of", "-)", "-]", "with", "as", "the", "from",
    ))
    # Code points of the non-ASCII characters that are transliterated;
    # every other non-ASCII character is removed.
    _CHAR_REPLACEMENTS = {201: "e", 241: "n", 225: "a", 251: "u"}
    # The class numbers accepted by control_the_program, as strings.
    _VALID_CLASSES = tuple(str(number) for number in range(1, 46))

    @staticmethod
    def _to_ascii(text):
        """Return text with every non-ASCII character replaced or removed."""
        replacements = TrademarkService._CHAR_REPLACEMENTS
        return "".join(
            ch if ord(ch) <= 127 else replacements.get(ord(ch), "")
            for ch in text
        )

    @staticmethod
    def _strip_chars(text, unwanted):
        """Return text without any of the characters in unwanted."""
        return "".join(ch for ch in text if ch not in unwanted)

    @staticmethod
    def open_file_read_words(file_name):
        """
        Read a trademark class file into a list of [count, phrase] entries.

        Each non-blank line must look like ``<integer>,<phrase>``; the line
        is split on the first comma only, so the phrase may contain commas.
        Raises ValueError for a line without a comma or with a
        non-integer count.
        """
        entries = []
        with open(file_name) as word_file:
            for line in word_file:
                line = line.strip()
                if not line:
                    continue
                count, phrase = line.split(',', 1)
                entries.append([int(count), phrase])
        return entries

    @staticmethod
    def normalize_array(tm_word_count_array):
        """
        Normalize word frequency by the number of entries x 1000.

        The entries are updated in place and the same list is returned.
        An empty list is returned unchanged.
        """
        max_entries = len(tm_word_count_array)
        for entry in tm_word_count_array:
            entry[0] = (float(entry[0]) / max_entries) * 1000
        return tm_word_count_array

    @staticmethod
    def find_not_useful_words(word):
        """Return False when word is on the not-useful list, True otherwise."""
        return word not in TrademarkService._NOT_USEFUL_WORDS

    @staticmethod
    def clean_up_phrases(data):
        """
        Clean up a whole phrase by removing problematic characters.

        Returns the lower-cased phrase without non-ASCII characters and
        punctuation, or '' when the result is a not-useful word or is
        shorter than two characters.
        """
        phrase = TrademarkService._to_ascii(data).lower()
        phrase = TrademarkService._strip_chars(phrase, "?.!/;:,'()[]")
        if TrademarkService.find_not_useful_words(phrase) and len(phrase) > 1:
            return phrase
        return ''

    @staticmethod
    def find_important_words(data):
        """
        Split data on whitespace and return the cleaned, useful words.

        Each word is lower-cased and stripped of non-ASCII characters,
        punctuation and one leading/trailing hyphen; not-useful words and
        words shorter than two characters are dropped.
        """
        important_words = []
        for raw_word in data.split():
            word = TrademarkService._to_ascii(raw_word).lower()
            word = TrademarkService._strip_chars(word, " ?.!/;:,'()[]")
            if word.endswith("-"):
                word = word[:-1]
            if word.startswith("-"):
                # Drop the hyphen and keep the rest of the word.
                word = word[1:]
            if TrademarkService.find_not_useful_words(word) and len(word) > 1:
                important_words.append(word)
        return important_words

    @staticmethod
    def analyze_each_line_test_data(test_sentence, N, normalized_tm_word_count_array):
        """
        Score test_sentence against the first N entries of one class.

        Multi-word entries score their weight once when they occur as a
        substring of the cleaned sentence; every entry scores its weight
        once per occurrence in the list of important words.

        Returns a tuple (total_found, total_TM_words_matched,
        not_found_words, total_TM_class_count), where not_found_words is
        the sorted list of important words that match no entry.
        """
        entries = normalized_tm_word_count_array[:N]
        total_found = 0
        total_TM_class_count = 0
        total_TM_words_matched = []

        # score the trademark phrases in the string
        test_important_phrases = TrademarkService.clean_up_phrases(test_sentence)
        for entry in entries:
            weight, phrase = entry[0], entry[1]
            if len(phrase.split()) > 1 and phrase in test_important_phrases:
                total_TM_words_matched.append(phrase)
                total_TM_class_count += weight
                total_found += 1

        # decompose the string and remove extraneous words, then score the words
        test_important_words = TrademarkService.find_important_words(test_sentence)
        for entry in entries:
            weight, phrase = entry[0], entry[1]
            count_words = test_important_words.count(phrase)
            if count_words > 0:
                total_TM_words_matched.append(phrase)
                total_TM_class_count += weight * count_words
                total_found += 1

        known_words = set(entry[1] for entry in entries)
        not_found_words = sorted(set(test_important_words) - known_words)
        return total_found, total_TM_words_matched, not_found_words, total_TM_class_count

    @staticmethod
    def open_class_file_read_words_to_array(file_name, file_name_class=None):
        """
        Read one trademark class file into a list of [count, phrase] entries.

        The file read is file_name_class when it is given, otherwise
        file_name.
        """
        target = file_name if file_name_class is None else file_name_class
        return TrademarkService.open_file_read_words(target)

    @staticmethod
    def create_results_file(file_name, results_array, description):
        """Append description and the rows of results_array to file_name."""
        rows = np.array(results_array, dtype=object)
        with open(file_name, 'a') as results_file:
            results_file.write("New trademark comparison")
            results_file.write("\n")
            results_file.write(description)
            results_file.write("\n")
            np.savetxt(results_file, rows, fmt='%s', delimiter=',')
            results_file.write("\n\n\n")

    @staticmethod
    def control_the_program(classes, description, data_dir=''):
        """
        Classify description and return the three best scoring classes.

        classes - 'yes' to test all 45 trademark classes, or one class
        number from '1' to '45' to test only that class.
        description - the description of goods or services.
        data_dir - directory holding the counted_phrases_class<N>.txt
        files; defaults to the current working directory.

        Returns a list of four rows. The first three rows are
        [score, class number, words not found] for the top, second and
        third result ([0, 0, []] when there is no such result); the
        fourth row is [0, 0, 0].

        Raises ValueError when classes is neither 'yes' nor a valid
        class number.
        """
        if classes == 'yes':
            class_numbers = range(1, 46)
        elif classes in TrademarkService._VALID_CLASSES:
            class_numbers = [int(classes)]
        else:
            raise ValueError(
                "classes must be 'yes' or a class number from 1 to 45, got %r" % (classes,))

        candidates = []
        for class_number in class_numbers:
            # opens each file with the trademark words
            file_name_class = os.path.join(
                data_dir, 'counted_phrases_class' + str(class_number) + '.txt')
            # normalization is used because some classes have many words and
            # some have few words; the words/phrases are weighted by frequency
            word_counts = TrademarkService.normalize_array(
                TrademarkService.open_class_file_read_words_to_array(file_name_class))
            _, _, not_found_words, score = TrademarkService.analyze_each_line_test_data(
                description, len(word_counts), word_counts)
            if int(score) > 0:
                candidates.append([int(score), class_number, not_found_words])

        # The sort is stable, so among equal scores the lower class number
        # stays ahead, exactly as when results are shifted down one by one.
        candidates.sort(key=lambda row: row[0], reverse=True)
        overall_array_results = candidates[:3]
        while len(overall_array_results) < 3:
            overall_array_results.append([0, 0, []])
        overall_array_results.append([0, 0, 0])
        return overall_array_results
From the code that I've provided you can see that these parameters were originally provided through Python's raw_input, and that after the calculation the code created a file in which you could read the result.
I've rewritten this so I can serve it over the django application, so parameters classes and description should overwrite the raw_input and the result will be displayed in the template, like this:
{{ trademark.overall_array_results.top_result }}<br>
{{ trademark.overall_array_results.second_result }}<br>
{{ trademark.overall_array_results.third_result }}
I'm not sure if I'm doing the right thing here, so I need help to understand this better. Can someone help me overcome this error?
If classes is not "yes", then word_count_array_for_all_classes remains an empty list.

Try every weighted combination of letters from the text result of tesseract

I've been testing text recognition from images using pyocr (tesseract-ocr and libtesseract). I've been applying various PIL.ImageFilters and reading one specific string from the image after each filter. No single result is accurate, but I have 14 different results, and between all of them every correct character of the string in the image is present. So I enumerated each string and built a dict keyed by character position; each value is a dict that maps every character seen at that position to the number of times it occurred. Here's a shortened example
String In Image:
2HG2
Results:
#Note: this is not the actual order in which the strings are produced
2HC2
2HC2
2HCZ
2HOZ
2HOZ
2HOZ
2HOZ
2HGZ
2HGZ
2HGZ
ZHGZ
ZHGZ
ZH6Z
ZN6z
Dictionary:
{
0: {
u'2': 10,
u'Z': 4
}, 1: {
u'H': 13,
u'N': 1
}, 2: {
u'C': 3,
u'O': 4,
u'G': 5,
u'6': 2
}, 3: {
u'2': 2,
u'Z': 11,
u'z': 1
}
}
I'd like to try each combination of letters in each position until I get 2HG2. Any help would be appreciated.
EDIT:
The goal I'm trying to achieve is to scan a car registration, get the text from it, and then populate a form with the data. As a proof of concept, I'm trying to get the VIN from my personal registration. At the moment, I'm (most likely naively) applying a series of PIL.ImageFilters and getting the text from each. Below is my script.
import re
from itertools import permutations
from PIL import Image, ImageFilter
import pyocr
from pyocr import builders
vins = []
characters = {}
def validate(vincode):
    """
    Return True when vincode is a 17 character VIN whose check digit
    (position 9) matches the weighted sum of its characters.

    Validation code from https://en.wikipedia.org/wiki/Vehicle_identification_number

    Returns False for non-string input, for a wrong length and for any
    character that has no value in the transliteration table (which
    includes I, O and Q).
    """
    maps = "0123456789X"
    weights = [
        8, 7, 6, 5, 4, 3, 2, 10, 0, 9, 8, 7, 6, 5, 4, 3, 2
    ]
    table = {
        "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9,
        "A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "H": 8,
        "J": 1, "K": 2, "L": 3, "M": 4, "N": 5, "P": 7, "R": 9,
        "S": 2, "T": 3, "U": 4, "V": 5, "W": 6, "X": 7, "Y": 8, "Z": 9,
    }
    try:
        string_types = (str, unicode)
    except NameError:
        # Python 3 has no separate unicode type.
        string_types = (str,)
    if not isinstance(vincode, string_types):
        return False
    vincode = vincode.upper()
    if len(vincode) != 17:
        return False
    if "I" in vincode or "O" in vincode or "Q" in vincode:
        return False
    try:
        total = sum(table[value] * weight for value, weight in zip(vincode, weights))
    except KeyError:
        # A character without a table value makes the whole VIN invalid;
        # comparing the check digit against a partial sum could accept it.
        return False
    return maps[total % 11] == vincode[8]
def get_text(tools_, img_):
    """
    Run every OCR tool in tools_ on img_ and collect VIN candidates.

    The content of each recognised word box is stripped of all
    non-alphanumeric characters (and underscores); results that are
    exactly 17 characters long are appended to the global vins list.
    The tool named 'Cuneiform (sh)' is skipped.
    """
    # Compiled once for all tools; the raw string keeps \W a regex escape.
    pattern = re.compile(r'[\W_]+')
    for tool in tools_:
        if tool.get_name() == 'Cuneiform (sh)':
            continue
        boxes = tool.image_to_string(img_, lang='eng', builder=builders.WordBoxBuilder())
        cleaned = (pattern.sub('', box.content) for box in boxes)
        vins.extend(text for text in cleaned if len(text) == 17)
# boxes = [x for x in boxes if len(x.content.strip()) != 0]
# print boxes[3].content
# for box in boxes:
# print box.content
def apply_filters_and_get_text(img_, filter_):
    """
    Apply filter_ with sizes 1 to 4 to img_ and run OCR after each step.

    The filters accumulate: each size is applied to the result of the
    previous successful one. A size that raises ValueError is reported
    and skipped. Every filtered image is saved to a tmp<filter>-<size>.jpg
    file before its text is collected with get_text.
    """
    for size in range(1, 5):
        label = str(filter_)
        print('Applying {} size: {}'.format(label, size))
        try:
            filtered = img_.filter(filter_(size))
        except ValueError:
            print('error on {} size: {}'.format(label, size))
            continue
        img_ = filtered
        img_.save('tmp{}-{}.jpg'.format(label, size))
        get_text(tools, img_)
def count_occurrences(value):
    """
    Tally every character of value by its position.

    The global characters dict maps a position to a dict that maps each
    character seen at that position to its number of occurrences.
    """
    global characters
    for position, letter in enumerate(value):
        seen = characters.get(position)
        if isinstance(seen, dict):
            seen[letter] = seen.get(letter, 0) + 1
        else:
            # First character recorded at this position.
            characters[position] = {letter: 1}
# Script entry: load the test image, binarise it, run OCR after each
# filter pass and tally the recognised characters per position.
tools = pyocr.get_available_tools()
img = Image.open('images/test18.jpg')
# get_text(tools)
# img = img.filter(ImageFilter.MaxFilter(5))
# img = img.filter(ImageFilter.SHARPEN)
# img = img.filter(ImageFilter.SMOOTH_MORE)
# get_text(tools)
# get_text(tools)
# Convert to mode 'L' before thresholding.
img = img.convert('L')
# get_text(tools)
# img = img.filter(ImageFilter.MaxFilter(5))
# img = img.filter(ImageFilter.SHARPEN)
# img = img.filter(ImageFilter.SMOOTH_MORE)
# get_text(tools)
# get_text(tools)
# Threshold at 128: values below become 0, all others 255 (mode '1').
img = img.point(lambda x: 0 if x < 128 else 255, '1')
# Each call starts from the same thresholded image and appends its
# 17 character results to vins.
apply_filters_and_get_text(img, ImageFilter.MedianFilter)
apply_filters_and_get_text(img, ImageFilter.MinFilter)
apply_filters_and_get_text(img, ImageFilter.MaxFilter)
apply_filters_and_get_text(img, ImageFilter.ModeFilter)
# Build the position -> {character: occurrences} table from all results.
for vin in vins:
    count_occurrences(vin)
# print vin
# print validate(vin)
print characters
I was able to figure out a recursive function that tries every combination of the letters with priority to characters with higher weight.
def determine_character(characters_, tried=()):
    """
    Return the untried character with the highest weight.

    characters_ - dict that maps a character to its weight (number of
    occurrences).
    tried - characters that must not be returned again.

    Returns "" when every character with a positive weight has been
    tried. When several characters share the highest weight, the first
    one in the dict's iteration order is returned.
    """
    next_character = ""
    current_rank = 0
    for ch, rank in characters_.items():
        if rank > current_rank and ch not in tried:
            next_character = ch
            # Remember the best weight so far so that lighter characters
            # seen later cannot replace a heavier one.
            current_rank = rank
    return next_character
def determine_weight(word):
    """
    Return the total weight of word: the sum, over every position, of
    the occurrence count recorded in the global characters table for
    the character at that position.
    """
    global characters
    return sum(characters[position][letter] for position, letter in enumerate(word))
def descramble(word="", index=0):
    """
    Recursively extend word with every candidate character per position.

    At each position the candidates from the global characters table are
    tried in the order chosen by determine_character. A complete word
    that passes validate is counted in vin_count and recorded, with its
    weight, in valid_vins.

    Returns a dict {'word': ..., 'done': True} for a complete valid word,
    False for a complete invalid word and None after trying all
    candidates of a position.
    """
    global characters, vin_count, valid_vins
    total_positions = len(characters)
    if index == total_positions:
        if not validate(word):
            return False
        vin_count += 1
        valid_vins.append({'vin': word, 'weight': determine_weight(word)})
        return {'word': word, 'done': True}
    candidates = characters[index]
    tried = []
    while len(tried) < len(candidates):
        ch = determine_character(candidates, tried)
        tried.append(ch)
        descramble("{word}{ch}".format(word=word, ch=ch), index + 1)

How do I sort out my links in decreasing order (I have the values to the links, (num_to_words(v)))

I am making a web crawler, and now I need a sorting algorithm that sorts my links in decreasing order, so I can see which link appears most often on this web page. This is the code I wrote in Python:
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
all_links = defaultdict(int)
def webpages():
    """
    Crawl the start page and count the links found on each linked page.

    Every matching anchor on the start page is printed and handed to
    get_single_item_data, which updates the global all_links counter.
    Returns all_links.
    """
    start_url = 'http://www.hm.com/lv/department/MEN'
    page = BeautifulSoup(requests.get(start_url).text)
    for anchor in page.findAll('a', {'class': ' ', 'rel': 'nofollow'}):
        target = anchor.get('href')
        print(target)
        get_single_item_data(target)
    return all_links
def get_single_item_data(item_url):
    """
    Fetch item_url and count each of its links that starts with
    'http://www.' in the global all_links counter, printing every
    counted link.
    """
    page = BeautifulSoup(requests.get(item_url).text)
    for anchor in page.findAll('a'):
        target = anchor.get('href')
        # Anchors without an href, or pointing elsewhere, are ignored.
        if not target or not target.startswith('http://www.'):
            continue
        all_links[target] += 1
        print(target)
webpages()
# Word tables for num_to_words; index 0 is the empty placeholder.
units = ["", "one", "two", "three", "four", "five",
         "six", "seven", "eight", "nine"]
teens = ["", "eleven", "twelve", "thirteen", "fourteen",
         "fifteen", 'sixteen', "seventeen", "eighteen", "nineteen"]
tens = ["", "ten", "twenty", "thirty", "forty",
        "fifty", "sixty", "seventy", "eighty", "ninety"]
# thousands[g] names the g-th group of three digits, counted from the right.
thousands = ["", "thousand", "million", "billion", "trillion",
             "quadrillion", "quintillion", "sextillion", "septillion", "octillion",
             "nonillion", "decillion", "undecillion", "duodecillion", "tredecillion",
             "quattuordecillion", "quindecillion", "sexdecillion", "septendecillion",
             "octodecillion", "novemdecillion", "vigintillion"]


def num_to_words(n):
    """
    Return the integer n spelled out in English words.

    The digits are processed in groups of three; each non-zero group is
    followed by its scale word ("thousand", "million", ...). "and" is
    inserted after "hundred" when the same group has tens or units,
    e.g. 110 -> "one hundred and ten". Negative numbers are prefixed
    with "minus".

    Raises IndexError when n has more digit groups than the thousands
    table names.
    """
    if n == 0:
        return "zero"
    if n < 0:
        return "minus " + num_to_words(-n)
    words = []
    num_str = "{}".format(n)
    groups = (len(num_str) + 2) // 3
    # Left-pad so that the digits split evenly into groups of three.
    num_str = num_str.zfill(groups * 3)
    for i in range(0, groups * 3, 3):
        h, t, u = (int(digit) for digit in num_str[i:i + 3])
        g = groups - (i // 3 + 1)
        if h >= 1:
            words.append(units[h])
            words.append("hundred")
            # Only this group's own tens/units decide whether "and" follows.
            if t or u:
                words.append("and")
        if t > 1:
            words.append(tens[t])
            if u >= 1:
                words.append(units[u])
        elif t == 1:
            if u >= 1:
                words.append(teens[u])
            else:
                words.append(tens[t])
        elif u >= 1:
            words.append(units[u])
        if g >= 1 and (h + t + u) > 0:
            words.append(thousands[g])
    return " ".join(words)
for k, v in webpages().items():
print(k, num_to_words(v))
If they are stored in an array you can sort the array.
For example:
# Array
a = [6, 2, 9, 3]
# sort the array
a.sort()
Maybe this link will also help: Link about sorting
Use sort function in python.
Help on built-in function sort: (copied from python help)
sort(...)
L.sort(cmp=None, key=None, reverse=False) -- stable sort *IN PLACE*;
cmp(x, y) -> -1, 0, 1
(END)
Now to do reverse sort use this:
>> L= [1,2,3,4]
>>> L.sort(reverse=True)
>>> L
[4, 3, 2, 1]
>>>
You can also use a custom key function for the comparison.
sort sorts the list in place; if you don't want that, use sorted, which returns a new list:
>>> L=[1,2,3,4]
>>> sorted(L,reverse=True)
[4, 3, 2, 1]
>>> L
[1, 2, 3, 4]
>>>
dct = webpages()
for k in sorted(dct,key=dct.get,reverse=True):
print(k, num_to_words(dct[k]))
Or use itemgetter to sort the items:
from operator import itemgetter
for k, v in sorted(webpages().items(),key=itemgetter(1),reverse=True):
print(k, num_to_words(v))

Categories