Removing accents from keyword strings

Removing accents from keyword strings - python

This is a word processing code for chabot, in it it removes some articles and prepositions to make it easier for the bot to read
import json
from random import choice
class ChatterMessage:
def __init__(self, raw):
self.raw = str(raw).lower()
self.processed_str = self.reduce()
self.responses = self.get_responses()
self.data = self.process_response()
self.response = choice(self.data['response'])
def remove_unwanted_chars(self, string):
list_of_chars = ['?', ".", ",", "!", "#", "[", "]", "{", "}", "#", "$", "%", "*", "&", "(", ")", "-", "_", "+", "="]
new_str = ""
for char in string:
if char not in list_of_chars:
new_str += str(char)
return new_str
def get_responses(self, response_file="info.json"):
with open(response_file, 'r') as file:
return json.loads(file.read())
def reduce(self):
stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
custom_filter = []
keywords_list = []
strlist = self.raw.split(" ")
for x in strlist:
if x not in stopwords and x not in custom_filter:
keywords_list.append(self.remove_unwanted_chars(x))
return keywords_list
def process_response(self):
percentage = lambda x, y: (100 * y) / x
total = sum(len(x['keywords']) for x in self.responses)
most_acc = 0
response_data = None
acc = 0
for value in self.responses:
c = 0
for x in value['keywords']:
if str(x).lower() in self.processed_str:
c += 1
if c > most_acc:
most_acc = c
acc = percentage(total, most_acc)
print(acc)
response_data = value
if acc < 6:
return {"response": "Sorry, I do not understand. Be more clear please"}
for x in self.processed_str:
if x not in response_data['keywords']:
response_data['keywords'].append(x)
return response_data
if __name__ == '__main__':
while True:
k = input("Você: ")
res = ChatterMessage(k)
.response
print("Bot:", res)
How to remove accents from keyword strings to "make it easier" for chatbot to read? I found this explanation: How to remove string accents using Python 3? But I don't know how it would be applied to this code as the bot always stops responding

You could use the Python package unidecode that replaces special characters with ASCII equivalents.
from unidecode import unidecode
text = "Björn, Łukasz and Σωκράτης."
print(unidecode(text))
# ==> Bjorn, Lukasz and Sokrates.
You could apply this to both the input and keywords.
# In the function definition of reduce(), place this line of code after
# stopwords = ['de', 'a', 'o', .....])
stopwords = [unidecode(s) for s in stopwords]
# In "__main__": replace k = input("Você: ") with the following line of code.
k = unidecode(input("Você: "))
If it makes sense, you could also force the strings to be all lowercase. This will make your string comparisons even more robust.
k = unidecode(input("Você: ").lower())
Because you requested the entire code:
import json
from random import choice
from unidecode import unidecode
class ChatterMessage:
def __init__(self, raw):
self.raw = str(raw).lower()
self.processed_str = self.reduce()
self.responses = self.get_responses()
self.data = self.process_response()
self.response = choice(self.data['response'])
def remove_unwanted_chars(self, string):
list_of_chars = ['?', ".", ",", "!", "#", "[", "]", "{", "}", "#", "$", "%", "*", "&", "(", ")", "-", "_", "+", "="]
new_str = ""
for char in string:
if char not in list_of_chars:
new_str += str(char)
return new_str
def get_responses(self, response_file="info.json"):
with open(response_file, 'r') as file:
return json.loads(file.read())
def reduce(self):
stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
stopwords = [unidecode(s) for s in stopwords]
custom_filter = []
keywords_list = []
strlist = self.raw.split(" ")
for x in strlist:
if x not in stopwords and x not in custom_filter:
keywords_list.append(self.remove_unwanted_chars(x))
return keywords_list
def process_response(self):
percentage = lambda x, y: (100 * y) / x
total = sum(len(x['keywords']) for x in self.responses)
most_acc = 0
response_data = None
acc = 0
for value in self.responses:
c = 0
for x in value['keywords']:
if str(x).lower() in self.processed_str:
c += 1
if c > most_acc:
most_acc = c
acc = percentage(total, most_acc)
print(acc)
response_data = value
if acc < 6:
return {"response": "Sorry, I do not understand. Be more clear please"}
for x in self.processed_str:
if x not in response_data['keywords']:
response_data['keywords'].append(x)
return response_data
if __name__ == '__main__':
while True:
k = unidecode(input("Você: "))
res = ChatterMessage(k).response
print("Bot:", res)

Related

Program outputs incorrect string format with join method

I am trying to output in the format (a, b and c) from a function that should print common letters from 2 strings, but what I currently get is (a,b,and,c).
def task10(str1,str2):
str1=str1.lower()
str2=str2.lower()
ch1=""
ch2=""
for i in str1:
if i.isalpha():
ch1+=i
for k in str2:
if k.isalpha():
ch2+=k
common = list(set([c for c in ch1 if c in ch2]))
s=len(common)
if s==0:
common=['no common letters']
common.sort()
if s>1:
common.insert(-1," and ")
common= ', '.join(common)
common = common.replace(', and ,', 'and')
print(f'{common}')
task10("i like big cups","i cannot lie")

Do a string replace for ,and, with and to get the correct output.
def task10(str1,str2):
str1=str1.lower()
str2=str2.lower()
badchars = ['$','#','%',';',':','!',"*"," ",'1',"2","3","4","5","6","7","8","9","0",'^','&','#','~','?','[]','{',']',"+",'=','-','_','-',",",'"',"'",'`',"|","\\",'(',')']
for i in badchars:
str1=str1.replace(i,"")
str2=str2.replace(i,"")
common = list(set([c for c in str1 if c in str2]))
i=len(common)
if i==0:
common=['no common letters']
common.sort()
if i>1:
common.insert(-1,"and")
common= ','.join(common)
common = common.replace(',and,', ' and ')
print(f'{common}')
task10('icl!','i cannot lie!')
Output:
c,i and l

You can do this efficiently with set manipulation. Note that badchars is a set and not a list as in the original question.
badchars = {'$', '#', '%', ';', ':', '!', "*", " ", '1', "2", "3", "4", "5", "6", "7", "8", "9", "0", '^',
'&', '#', '~', '?', '[', '{', ']', "+", '=', '-', '_', '-', ",", '"', "'", '`', "|", "\\", '(', ')', '£'}
def task10(str1, str2):
match len(rv := (set(str1) & set(str2)) - badchars):
case 0:
return ''
case 1:
return rv.pop()
case _:
rv = list(rv)
return ', '.join(rv[:-1]) + ' and ' + rv[-1]
print(task10('ic!', 'i cannot lie!'))
Output:
i, c and l

Output will not become a string from a list

import os
import random
file = open('getty.txt')
filetext = file.read()
def getline(words,length):
ans=[]
total=0
while (length>total) and 0 != len(words):
word=words.pop(0)
total += len(word)+1 #add 1 for the space
ans.append(word)
#now we are one word too long
if total > length:
words.insert(0,ans.pop())
return ans
def printPara(words,length):
line = []
spaces = []
while len(words) != 0:
line.append(getline(words, length))
for z in range(0,len(line)):
for i in range(0,len(line[z])):
spaces = [[1] * len(line[i]) for i in range (len(line))]
for p in range (0,len(spaces)):
spaces[p][len(spaces[p])-1] = 0
if len(words) + len(spaces) != 0:
addSpace(line,spaces,length)
printLine(line,spaces)
else:
printLine(line,spaces)
def addSpace(line,spaces,length):
totalInt = 0
for i in range (0, len(line)):
totalInt = (len(spaces[i])-2) + len(line[i])
while length < totalInt:
num = random.randint(0, len(spaces) - 2)
spaces[num] += 1
return spaces
def printLine(line, spaces):
for i in range (len(line)):
print(str(line[i]) + (' ' * len(spaces[i])))
def main():
length = 75
textparagraph = filetext.split("\n\n")
para = [0] * len(textparagraph)
for i in range (0, len(textparagraph)):
para[i] = textparagraph[i]
words = [[0] * len(textparagraph) for i in range(len(para))]
for b in range (0,len(para)):
words[b] = para[b].split()
for z in range (0, len(para)):
printPara(words[z],length)
main()
My code outputs only lists of the separate lines and will not concatenate the two lists of words and spaces. How would I get it to output correctly?
Some exampes of output.
['Four', 'score', 'and', 'seven', 'years', 'ago', 'our', 'fathers', 'brought', 'forth', 'on', 'this']
['continent,', 'a', 'new', 'nation,', 'conceived', 'in', 'Liberty,', 'and', 'dedicated', 'to', 'the']
['proposition', 'that', 'all', 'men', 'are', 'created', 'equal.']
['Now', 'we', 'are', 'engaged', 'in', 'a', 'great', 'civil', 'war,', 'testing', 'whether', 'that', 'nation,', 'or']
['any', 'nation', 'so', 'conceived', 'and', 'so', 'dedicated,', 'can', 'long', 'endure.', 'We', 'are', 'met', 'on', 'a']
Expected output "Four score and seven years ago..."

You can use
" ".join(["hello", "world"])
and you'll get
"hello world"

why does this it say in the console Process finished with exit code 0 instead of printing the 'sen' variable? [duplicate]

This question already has answers here:
How to check if type of a variable is string? [duplicate]
(22 answers)
Closed 2 years ago.
import random
import sys
def v1_debug(v1, subject):
if v1 != str and subject != str:
sys.exit()
else:
if subject == 'He' or 'She' or 'It':
for i in v1:
if i == [len(v1)+1]:
if i == 's' or 'z' or 'x' or 'o':
v1 = v1 + 'es'
elif i == 'y':
v1 = v1 - 'y' + 'ies'
elif v1[len(v1)] == 's' and v1[len(v1)+1] == 'h':
v1 = v1 + 'es'
elif v1[len(v1)] == 'c' and v1[len(v1)+1] == 'h':
v1 = v1 + 'es'
if subject == 'I' or 'You' or 'We' or 'They':
for i in v1:
if i == v1[len(v1)+1]:
v1 = v1 + 'ing'
return ''
def default_positive_form():
try:
sbj = ['He',
'She',
'It',
'I',
'You',
'We',
'They']
v1 = ['be',
'beat',
'become',
'begin',
'bend',
'bet',
'bid',
'bite',
'blow',
'break',
'bring',
'build',
'burn',
'buy',
'catch',
'choose',
'come',
'cost',
'cut',
'dig',
'dive',
'do',
'draw',
'dream',
'drive',
'drink',
'eat',
'fall',
'feel',
'fight',
'find',
'fly',
'forget',
'forgive',
'freeze',
'get',
'give',
'go',
'grow',
'hang',
'have',
'hear',
'hide',
'hit',
'hold',
'hurt',
'keep',
'know',
'lay',
'lead',
'leave',
'lend',
'let',
'lie',
'lose',
'make',
'mean',
'meet',
'pay',
'put',
'read',
'ride',
'ring',
'rise',
'run',
'say',
'see',
'sell',
'send',
'show',
'shut',
'sing',
'sit',
'sleep',
'speak',
'spend',
'stand',
'swim',
'take',
'teach',
'tear',
'tell',
'think',
'throw',
'understand',
'wake',
'wear',
'win',
'write']
sbj = random.choice(sbj)
v1 = random.choice(v1)
verb_debug = v1_debug(v1, sbj)
sen = ''
if sbj == 'I':
sen = sbj + 'am' + verb_debug
elif sbj == 'He' or 'She' or 'It':
sen = sbj + 'is' + verb_debug
elif sbj == 'You' or 'We' or 'They':
sen = sbj + 'are' + verb_debug
print(f'{sen}')
except NameError:
print('this is bullshit')
return
default_positive_form()
this is python 3.8

sen will only consist of an empty string if none of the conditions of your if/elif/elif blocks are met. Change the print line to
print(f"sen is: {sen}")
But that's not the real problem. obj != str does not check if obj is a string, it checks to see if the object is pointing to the type constant str (Thanks Charles Duffy for the comment). Instead, use the builtin function isinstance() like so:
if not isinstance(v1, str) and not isinstance(subject, str):
print("Variables are the wrong type!")
sys.exit()

Replace multiple instances of a sub-string with items in a list

I have a string like below:
e = "how are you how do you how are they how"
My expected output is:
out = "how1 are you how2 do you how3 are they how4"
I'm trying in the following way:
def givs(y,x):
tt = []
va = [i+1 for i in list(range(y.count(x)))]
for i in va:
tt.append(x+str(i))
return tt
ls = givs(e, 'how')
ls = ['how1', 'how2', 'how3', 'how4']
fg = []
for i in e.split(' '):
fg.append(i)
fg = ['how', 'are', 'you', 'how', 'do', 'you', 'how', 'are', 'they', 'how']
For every instance of 'how' in 'fg' I want to replace with items in 'ls' and finally use join function to get the required output.
expected_output = ['how1', 'are', 'you', 'how2', 'do', 'you', 'how3', 'are',
'they', 'how4']
so that I can join the items by:
' '.join(expected_output)
to get:
out = "how1 are you how2 do you how3 are they how4"

You could use itertools.count:
from itertools import count
counter = count(1)
e = "how are you how do you how are they how"
result = ' '.join([w if w != "how" else w + str(next(counter)) for w in e.split()])
print(result)
Output
how1 are you how2 do you how3 are they how4

There is no need to make your code complex, just add a counter and add it to every "how". At the end make the new string.
e = "how are you how do you how are they how"
e_ok = ""
count = 1
for i in e.split():
if i == "how":
i = i+str(count)
count += 1
e_ok += i + " "
print(e_ok)

Remove Stopwords in python

I'm developing an algorithm to remove stopword.
I am transforming a txt file into a list and thus passing in the algorithm for removal.
Example of file lines:
'mora vai nascer viver cair falar','positivo'
'deixa ver entendi vai crescer vai passar ve','positivo'
'so deveria ter foi agradeco de passei passei fez','positivo'
'nunca nao nao muito nao mais','negativo'
'a nao ate infelizmente ai ate quando','negativo'
'nao perto nao quanto menos nao sim nao nem simplesmente','negativo'
Code
with open('BasePalavras.txt') as arquivo:
baseTeste = [linha.strip() for linha in arquivo]
stopwords = ['a', 'agora', 'algum', 'alguma', 'aquele', 'aqueles', 'de', 'deu', 'do', 'e', 'estou', 'esta', 'esta',
'ir', 'meu', 'muito', 'mesmo', 'no', 'nossa', 'o', 'outro', 'para', 'que', 'sem', 'talvez', 'tem', 'tendo',
'tenha', 'teve', 'tive', 'todo', 'um', 'uma', 'umas', 'uns', 'vou']
def removestopword(texto):
frases=[]
for(palavras, emocao) in texto:
semstopwords = [p for p in palavras.splits() if p not in stopwords]
frases.append((semstopwords, emocao))
return frases
print (removestopword(baseTeste))
ERROR
Traceback (most recent call last):
File "C:/Users/Rivaldo/PycharmProjects/Mineracao/Principal.py", line 22, in <module>
print (removestopword(baseTeste))
File "C:/Users/Rivaldo/PycharmProjects/Mineracao/Principal.py", line 17, in removestopword
for(palavras, emocao) in texto:
ValueError: too many values to unpack

Try this:
with open('BasePalavras.txt') as arquivo:
baseTeste = [linha.strip().split(',') for linha in arquivo]
stopwords = ['a', 'agora', 'algum', 'alguma', 'aquele', 'aqueles', 'de', 'deu', 'do', 'e', 'estou', 'esta', 'esta',
'ir', 'meu', 'muito', 'mesmo', 'no', 'nossa', 'o', 'outro', 'para', 'que', 'sem', 'talvez', 'tem', 'tendo',
'tenha', 'teve', 'tive', 'todo', 'um', 'uma', 'umas', 'uns', 'vou']
def removestopword(texto):
frases=[]
for (palavras, emocao) in texto:
semstopwords = [p for p in palavras.split() if p not in stopwords]
frases.append((semstopwords, emocao))
return frases
print (removestopword(baseTeste))
Changed baseTeste = [linha.strip() for linha in arquivo] to baseTeste = [linha.strip().split(',') for linha in arquivo]
and
semstopwords = [p for p in palavras.splits() if p not in stopwords] to semstopwords = [p for p in palavras.split() if p not in stopwords].

Here's how I would do it.
stopwords = ['a', 'agora', 'algum', 'alguma', 'aquele', 'aqueles', 'de', 'deu', 'do', 'e', 'estou', 'esta', 'esta',
'ir', 'meu', 'muito', 'mesmo', 'no', 'nossa', 'o', 'outro', 'para', 'que', 'sem', 'talvez', 'tem', 'tendo',
'tenha', 'teve', 'tive', 'todo', 'um', 'uma', 'umas', 'uns', 'vou']
def remove_stopwords(text):
phrases = []
for (sentence, _) in text:
sentence_without_stopwords = [word for word in sentence.split() if word not in stopwords]
phrases.append(sentence_without_stopwords)
return phrases
with open('input.txt') as raw_text:
sentence_sentiments = []
lines = [line for line in raw_text]
for line in lines:
sentence, sentiment = line.split(',')
sentence_sentiments.append((sentence[1:-1], sentiment[1:-1]))
print(remove_stopwords(sentence_sentiments))
Notice how, in your provided code, baseTeste is an array that contains a list of strings, representing the lines of your input file. This is not what you want, as you're attempting to loop (for(palavras, emocao) in texto:) over the (sentence, sentiment) pairs inside these lines. You are thus missing the middle step of splitting each line into (sentence, sentiment) pairs.

We Keep Coding

Python is a programming language that lets you work quickly and integrate systems more effectively.

Removing accents from keyword strings - python

Related

Program outputs incorrect string format with join method

Output will not become a string from a list

why does this it say in the console Process finished with exit code 0 instead of printing the 'sen' variable? [duplicate]

Replace multiple instances of a sub-string with items in a list

Remove Stopwords in python

Categories

Resources