import os
import random
file = open('getty.txt')
filetext = file.read()
def getline(words,length):
ans=[]
total=0
while (length>total) and 0 != len(words):
word=words.pop(0)
total += len(word)+1 #add 1 for the space
ans.append(word)
#now we are one word too long
if total > length:
words.insert(0,ans.pop())
return ans
def printPara(words,length):
line = []
spaces = []
while len(words) != 0:
line.append(getline(words, length))
for z in range(0,len(line)):
for i in range(0,len(line[z])):
spaces = [[1] * len(line[i]) for i in range (len(line))]
for p in range (0,len(spaces)):
spaces[p][len(spaces[p])-1] = 0
if len(words) + len(spaces) != 0:
addSpace(line,spaces,length)
printLine(line,spaces)
else:
printLine(line,spaces)
def addSpace(line,spaces,length):
totalInt = 0
for i in range (0, len(line)):
totalInt = (len(spaces[i])-2) + len(line[i])
while length < totalInt:
num = random.randint(0, len(spaces) - 2)
spaces[num] += 1
return spaces
def printLine(line, spaces):
for i in range (len(line)):
print(str(line[i]) + (' ' * len(spaces[i])))
def main():
length = 75
textparagraph = filetext.split("\n\n")
para = [0] * len(textparagraph)
for i in range (0, len(textparagraph)):
para[i] = textparagraph[i]
words = [[0] * len(textparagraph) for i in range(len(para))]
for b in range (0,len(para)):
words[b] = para[b].split()
for z in range (0, len(para)):
printPara(words[z],length)
main()
My code outputs only lists of the separate lines and will not concatenate the two lists of words and spaces. How would I get it to output correctly?
Some exampes of output.
['Four', 'score', 'and', 'seven', 'years', 'ago', 'our', 'fathers', 'brought', 'forth', 'on', 'this']
['continent,', 'a', 'new', 'nation,', 'conceived', 'in', 'Liberty,', 'and', 'dedicated', 'to', 'the']
['proposition', 'that', 'all', 'men', 'are', 'created', 'equal.']
['Now', 'we', 'are', 'engaged', 'in', 'a', 'great', 'civil', 'war,', 'testing', 'whether', 'that', 'nation,', 'or']
['any', 'nation', 'so', 'conceived', 'and', 'so', 'dedicated,', 'can', 'long', 'endure.', 'We', 'are', 'met', 'on', 'a']
Expected output "Four score and seven years ago..."
You can use
" ".join(["hello", "world"])
and you'll get
"hello world"
This is a word processing code for chabot, in it it removes some articles and prepositions to make it easier for the bot to read
import json
from random import choice
class ChatterMessage:
def __init__(self, raw):
self.raw = str(raw).lower()
self.processed_str = self.reduce()
self.responses = self.get_responses()
self.data = self.process_response()
self.response = choice(self.data['response'])
def remove_unwanted_chars(self, string):
list_of_chars = ['?', ".", ",", "!", "#", "[", "]", "{", "}", "#", "$", "%", "*", "&", "(", ")", "-", "_", "+", "="]
new_str = ""
for char in string:
if char not in list_of_chars:
new_str += str(char)
return new_str
def get_responses(self, response_file="info.json"):
with open(response_file, 'r') as file:
return json.loads(file.read())
def reduce(self):
stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
custom_filter = []
keywords_list = []
strlist = self.raw.split(" ")
for x in strlist:
if x not in stopwords and x not in custom_filter:
keywords_list.append(self.remove_unwanted_chars(x))
return keywords_list
def process_response(self):
percentage = lambda x, y: (100 * y) / x
total = sum(len(x['keywords']) for x in self.responses)
most_acc = 0
response_data = None
acc = 0
for value in self.responses:
c = 0
for x in value['keywords']:
if str(x).lower() in self.processed_str:
c += 1
if c > most_acc:
most_acc = c
acc = percentage(total, most_acc)
print(acc)
response_data = value
if acc < 6:
return {"response": "Sorry, I do not understand. Be more clear please"}
for x in self.processed_str:
if x not in response_data['keywords']:
response_data['keywords'].append(x)
return response_data
if __name__ == '__main__':
while True:
k = input("Você: ")
res = ChatterMessage(k)
.response
print("Bot:", res)
How to remove accents from keyword strings to "make it easier" for chatbot to read? I found this explanation: How to remove string accents using Python 3? But I don't know how it would be applied to this code as the bot always stops responding
You could use the Python package unidecode that replaces special characters with ASCII equivalents.
from unidecode import unidecode
text = "Björn, Łukasz and Σωκράτης."
print(unidecode(text))
# ==> Bjorn, Lukasz and Sokrates.
You could apply this to both the input and keywords.
# In the function definition of reduce(), place this line of code after
# stopwords = ['de', 'a', 'o', .....])
stopwords = [unidecode(s) for s in stopwords]
# In "__main__": replace k = input("Você: ") with the following line of code.
k = unidecode(input("Você: "))
If it makes sense, you could also force the strings to be all lowercase. This will make your string comparisons even more robust.
k = unidecode(input("Você: ").lower())
Because you requested the entire code:
import json
from random import choice
from unidecode import unidecode
class ChatterMessage:
def __init__(self, raw):
self.raw = str(raw).lower()
self.processed_str = self.reduce()
self.responses = self.get_responses()
self.data = self.process_response()
self.response = choice(self.data['response'])
def remove_unwanted_chars(self, string):
list_of_chars = ['?', ".", ",", "!", "#", "[", "]", "{", "}", "#", "$", "%", "*", "&", "(", ")", "-", "_", "+", "="]
new_str = ""
for char in string:
if char not in list_of_chars:
new_str += str(char)
return new_str
def get_responses(self, response_file="info.json"):
with open(response_file, 'r') as file:
return json.loads(file.read())
def reduce(self):
stopwords = ['de', 'a', 'o', 'que', 'e', 'é', 'do', 'da', 'em', 'um', 'para', 'com', 'não', 'uma', 'os', 'no', 'se', 'na', 'por', 'mais', 'as', 'dos', 'como', 'mas', 'ao', 'ele', 'das', 'à', 'seu', 'sua', 'ou', 'quando', 'muito', 'nos', 'já', 'eu', 'também', 'só', 'pelo', 'pela', 'até', 'isso', 'ela', 'entre', 'depois', 'sem', 'mesmo', 'aos', 'seus', 'quem', 'nas', 'me', 'esse', 'eles', 'você', 'essa', 'num', 'nem', 'suas', 'meu', 'às', 'minha', 'numa', 'pelos', 'elas', 'qual', 'nós', 'lhe', 'deles', 'essas', 'esses', 'pelas', 'este', 'dele', 'tu', 'te', 'vocês', 'vos', 'lhes', 'meus', 'minhas', 'teu', 'tua', 'teus', 'tuas', 'nosso', 'nossa', 'nossos', 'nossas', 'dela', 'delas', 'esta', 'estes', 'estas', 'aquele', 'aquela', 'aqueles', 'aquelas', 'isto', 'aquilo', 'estou', 'está', 'estamos', 'estão', 'estive', 'esteve', 'estivemos', 'estiveram', 'estava', 'estávamos', 'estavam', 'estivera', 'estivéramos', 'esteja', 'estejamos', 'estejam', 'estivesse', 'estivéssemos', 'estivessem', 'estiver', 'estivermos', 'estiverem', 'hei', 'há', 'havemos', 'hão', 'houve', 'houvemos', 'houveram', 'houvera', 'houvéramos', 'haja', 'hajamos', 'hajam', 'houvesse', 'houvéssemos', 'houvessem', 'houver', 'houvermos', 'houverem', 'houverei', 'houverá', 'houveremos', 'houverão', 'houveria', 'houveríamos', 'houveriam', 'sou', 'somos', 'são', 'era', 'éramos', 'eram', 'fui', 'foi', 'fomos', 'foram', 'fora', 'fôramos', 'seja', 'sejamos', 'sejam', 'fosse', 'fôssemos', 'fossem', 'for', 'formos', 'forem', 'serei', 'será', 'seremos', 'serão', 'seria', 'seríamos', 'seriam', 'tenho', 'tem', 'temos', 'tém', 'tinha', 'tínhamos', 'tinham', 'tive', 'teve', 'tivemos', 'tiveram', 'tivera', 'tivéramos', 'tenha', 'tenhamos', 'tenham', 'tivesse', 'tivéssemos', 'tivessem', 'tiver', 'tivermos', 'tiverem', 'terei', 'terá', 'teremos', 'terão', 'teria', 'teríamos', 'teriam']
stopwords = [unidecode(s) for s in stopwords]
custom_filter = []
keywords_list = []
strlist = self.raw.split(" ")
for x in strlist:
if x not in stopwords and x not in custom_filter:
keywords_list.append(self.remove_unwanted_chars(x))
return keywords_list
def process_response(self):
percentage = lambda x, y: (100 * y) / x
total = sum(len(x['keywords']) for x in self.responses)
most_acc = 0
response_data = None
acc = 0
for value in self.responses:
c = 0
for x in value['keywords']:
if str(x).lower() in self.processed_str:
c += 1
if c > most_acc:
most_acc = c
acc = percentage(total, most_acc)
print(acc)
response_data = value
if acc < 6:
return {"response": "Sorry, I do not understand. Be more clear please"}
for x in self.processed_str:
if x not in response_data['keywords']:
response_data['keywords'].append(x)
return response_data
if __name__ == '__main__':
while True:
k = unidecode(input("Você: "))
res = ChatterMessage(k).response
print("Bot:", res)
I have a dataframe where every row is a list of tuples , i.e.: tuple = (word, pos_tag). In each row, I want to change the word of some tuples by marking it and then update the tuple with the marked word. For example:
Initial dataframe row :
[('This', 'DET'), ('is', 'VERB'), ('an', 'DET'), ('example', 'NOUN'), ('text', 'NOUN'), ('that', 'DET'), ('I', 'PRON'), ('use', 'VERB'), ('in', 'ADP'), ('order', 'NOUN'), ('to', 'PART'), ('get', 'VERB'), ('an', 'DET'), ('answer', 'NOUN')]
Updated words :
updated_word : <IN>example</IN>
updated_word : <TAR>answer</TAR>
Desired output :
[('This', 'DET'), ('is', 'VERB'), ('an', 'DET'), ('<IN>example</IN>', 'NOUN'), ('text', 'NOUN'), ('that', 'DET'), ('I', 'PRON'), ('use', 'VERB'), ('in', 'ADP'), ('order', 'NOUN'), ('to', 'PART'), ('get', 'VERB'), ('an', 'DET'), ('<TAR>answer</TAR>', 'NOUN')]
But I get an error that TypeError: 'tuple' object is not callable. Can someone help? Here's the code :
for idx, row in df.iterrows():
doc = nlp(row['title'])
pos_tags = [(token.text, token.pos_) for token in doc if not token.pos_ == "PUNCT"]
for position, tuple in enumerate(pos_tags, start=1):
word = tuple[0]
spacy_pos_tag = tuple[1]
word = re.sub(r'[^\w\s]', '', word)
for col in cols:
if position in row[col]:
word = f'<{col.upper()}>{word}</{col.upper()}>'
else:
continue
tuple = tuple(word, spacy_pos_tag)
print(tuple)
>>>> Traceback (most recent call last):
>>>> tuple = tuple(word, spacy_pos_tag)
>>>> TypeError: 'tuple' object is not callable
Updated question
I have replaced tuple with tuple_ as suggested, but I still can't get back the desired output which is a list of tuples in every row. Can someone help how to update the dataframe rows? Here's the updated code :
for idx, row in df.iterrows():
doc = nlp(row['title'])
pos_tags = [(token.text, token.pos_) for token in doc if not token.pos_ == "PUNCT"]
# print(idx, "tokens, pos : ", pos_tags, "\n")
for position, tuple_ in enumerate(pos_tags, start=1):
word = tuple_[0]
spacy_pos_tag = tuple_[1]
word = re.sub(r'[^\w\s]', '', word)
for col in cols:
if position in row[col]:
word = f'<{col.upper()}>{word}</{col.upper()}>'
else:
continue
tuple_ = (word, spacy_pos_tag)
pos_tags.append(' '.join(position, tuple_))
# pos_tags.append(' '.join(tuple_))
print(idx, "tokens, pos : ", pos_tags, "\n")
>>>> Traceback (most recent call last):
>>>> pos_tag(df=df_matched)
>>>> pos_tags.append(' '.join(position, tuple_))
>>>> TypeError: join() takes exactly one argument (2 given)
Do not use tuple as a variable name, as it is a built-in python type name. Try the following instead:
for position, tuple_ in enumerate(pos_tags, start=1):
word = tuple_[0]
spacy_pos_tag = tuple_[1]
word = re.sub(r'[^\w\s]', '', word)
for col in cols:
if position in row[col]:
word = f'<{col.upper()}>{word}</{col.upper()}>'
else:
continue
tuple_ = (word, spacy_pos_tag)
print(tuple_)
Don't use "tuple" as name of a variable. It's a type name
Give an input sentence, that has BIO chunk tags:
[('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed',
'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'),
('swallow', 'I-NP'), ('?', 'O')]
I would need to extract the relevant phrases out, e.g. if I want to extract 'NP', I would need to extract the fragments of tuples that contains B-NP and I-NP.
[out]:
[('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
(Note: the numbers in the extract tuples represent the token index.)
I have tried extracting it using the following code:
def extract_chunks(tagged_sent, chunk_type):
current_chunk = []
current_chunk_position = []
for idx, word_pos in enumerate(tagged_sent):
word, pos = word_pos
if '-'+chunk_type in pos: # Append the word to the current_chunk.
current_chunk.append((word))
current_chunk_position.append((idx))
else:
if current_chunk: # Flush the full chunk when out of an NP.
_chunk_str = ' '.join(current_chunk)
_chunk_pos_str = '-'.join(map(str, current_chunk_position))
yield _chunk_str, _chunk_pos_str
current_chunk = []
current_chunk_position = []
if current_chunk: # Flush the last chunk.
yield ' '.join(current_chunk), '-'.join(current_chunk_position)
tagged_sent = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
print (list(extract_chunks(tagged_sent, chunk_type='NP')))
But when I have adjacent chunk of the same type:
tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
print (list(extract_chunks(tagged_sent, chunk_type='NP')))
It outputs this:
[('The Mitsubishi Electric Company Managing Director', '0-1-2-3-4-5'), ('ramen', '7')]
Instead of the desired:
[('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')]
How can this be resolved from the above code?
Other than how it's done from the code above, is there a better solution to extract the desired chunks of a specific chunk_type?
Try this, it will extract all types of chunks with the indices of their respective words.
def extract_chunks(tagged_sent, chunk_type='NP'):
out_sen = []
for idx, word_pos in enumerate(tagged_sent):
word,bio = word_pos
boundary,tag = bio.split("-") if "-" in bio else ('','O')
if tag != chunk_type:continue
if boundary == "B":
out_sen.append([word, str(idx)])
elif boundary == "I":
out_sen[-1][0] += " "+ word
out_sen[-1][-1] += "-"+ str(idx)
else:
out_sen.append([word, str(idx)])
return out_sen
Demo:
>>> tagged_sent = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'), ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
>>> output_sent = extract_chunks(tagged_sent)
>>> print map(tuple, output_sent)
[('The Mitsubishi Electric Company', '0-1-2-3'), ('Managing Director', '4-5'), ('ramen', '7')]
def extract_chunks(tagged_sent, chunk_type):
grp1, grp2, chunk_type = [], [], "-" + chunk_type
for ind, (s, tp) in enumerate(tagged_sent):
if tp.endswith(chunk_type):
if not tp.startswith("B"):
grp2.append(str(ind))
grp1.append(s)
else:
if grp1:
yield " ".join(grp1), "-".join(grp2)
grp1, grp2 = [s], [str(ind)]
yield " ".join(grp1), "-".join(grp2)
Output:
In [2]: l = [('The', 'B-NP'), ('Mitsubishi', 'I-NP'), ('Electric', 'I-NP'), ('Company', 'I-NP'), ('Managing', 'B-NP'),
...: ('Director', 'I-NP'), ('ate', 'B-VP'), ('ramen', 'B-NP')]
In [3]: list(extract_chunks(l, "NP"))
Out[3]:
[('The Mitsubishi Electric Company', '0-1-2-3'),
('Managing Director', '4-5'),
('ramen', '7')]
In [4]: l = [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')]
In [5]: list(extract_chunks(l, "NP"))
Out[5]: [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')]
I would do it like this:
import re
def extract_chunks(tagged_sent, chunk_type):
# compiles the expression we want to match
regex = re.compile(chunk_type)
# filters matched items in a dictionary whose keys are the matched indexes
first_step = {index_:tag[0] for index_, tag in enumerate(tagged_sent) if regex.findall(tag[1])}
# builds list of lists following output format
second_step = []
for key_ in sorted(first_step.keys()):
if second_step and int(second_step [len(second_step )-1][1].split('-')[-1]) == key_ -1:
second_step[len(second_step)-1][0] += ' {0}'.format(first_step[key_])
second_step[len(second_step)-1][1] += '-{0}'.format(str(key_))
else:
second_step.append([first_step[key_], str(key_)])
# builds output in final format
return [tuple(item) for item in second_step]
You can adapt it to use generators instead of building the whole output in memory like I am doing and refactory it for better performance (I'm in a hurry so the code is far from optimal).
Hope it helps!