Python - Extract Sentences - Off by 1

I want to extract the sentences of a paragraph and print them out line by line. It does a perfect job except when a period is followed by a newline, for example when the user finishes a sentence and then hits enter, so there is no space after the period.
The code considers such a sentence to be part of the previous one because there is no space, and when it is printed out they are stuck together. In other words, how can I alter the code so it extracts a sentence even when there is no space after the period? For example, This.should.be.considered.five.sentences contains five periods and should count as five sentences, but the code considers it only one.
This is the code:
import re

abbreviations = {'dr.': 'doctor', 'mr.': 'mister', 'bro': 'brother', 'mrs.': 'mistress',
                 'ms.': 'miss', 'jr.': 'junior', 'sr.': 'senior',
                 'i.e.': 'for example', 'e.g.': 'for example', 'vs.': 'versus'}
terminators = ['.', '!', '?']
wrappers = ['"', "'", ')', ']', '}']

def find_sentences(paragraph):
    end = True
    sentences = []
    while end > -1:
        end = find_sentence_end(paragraph)
        if end > -1:
            sentences.append(paragraph[end:].strip())
            paragraph = paragraph[:end]
    sentences.append(paragraph)
    sentences.reverse()
    return sentences

def find_sentence_end(paragraph):
    [possible_endings, contraction_locations] = [[], []]
    contractions = abbreviations.keys()
    sentence_terminators = terminators + [terminator + wrapper for wrapper in wrappers for terminator in terminators]
    for sentence_terminator in sentence_terminators:
        t_indices = list(find_all(paragraph, sentence_terminator))
        possible_endings.extend(([] if not len(t_indices) else [[i, len(sentence_terminator)] for i in t_indices]))
    for contraction in contractions:
        c_indices = list(find_all(paragraph, contraction))
        contraction_locations.extend(([] if not len(c_indices) else [i + len(contraction) for i in c_indices]))
    possible_endings = [pe for pe in possible_endings if pe[0] + pe[1] not in contraction_locations]
    if len(paragraph) in [pe[0] + pe[1] for pe in possible_endings]:
        max_end_start = max([pe[0] for pe in possible_endings])
        possible_endings = [pe for pe in possible_endings if pe[0] != max_end_start]
    possible_endings = [pe[0] + pe[1] for pe in possible_endings
                        if sum(pe) > len(paragraph) or (sum(pe) < len(paragraph) and paragraph[sum(pe)] == ' ')]
    end = (-1 if not len(possible_endings) else max(possible_endings))
    return end

def find_all(a_str, sub):
    start = 0
    while True:
        start = a_str.find(sub, start)
        if start == -1:
            return
        yield start
        start += len(sub)
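A possible direction (my sketch; no answer is captured in this thread): the final filter in find_sentence_end only accepts an ending whose terminator is followed by a space, which is exactly why '.\n' and '.word' are missed. Relaxing that check makes any in-range terminator qualify:
# Hedged tweak, replacing the final possible_endings filter in
# find_sentence_end above: accept any ending that fits inside the paragraph
# instead of requiring a trailing space, so '.\n' and '.word' both end a sentence.
possible_endings = [pe[0] + pe[1] for pe in possible_endings
                    if sum(pe) <= len(paragraph)]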


diff list of multiline strings with difflib without knowing which were added, deleted or modified

I have two lists of multiline strings, and I am trying to get the diff lines for these strings. First I tried to just split all lines of each string, handle them all as one big "file", and get the diff for that, but I had a lot of bugs. I cannot just diff by index, since I do not know which multiline string was added, which was deleted, and which one was modified.
Let's say I have the following example:
import difflib

oldList = ["one\ntwo\nthree", "four\nfive\nsix", "seven\neight\nnine"]
newList = ["four\nfifty\nsix", "seven\neight\nnine", "ten\neleven\ntwelve"]

oldAllTogether = []
for string in oldList:
    oldAllTogether.extend(string.splitlines())
newAllTogether = []
for string in newList:
    newAllTogether.extend(string.splitlines())

diff = difflib.unified_diff(oldAllTogether, newAllTogether)
So I somehow have to find out which strings belong to each other.
I had to implement my own code in order to get the desired output. It is basically the same as Differ.compare(), except that it looks at multiline blocks instead of lines. So the code would be:
diffString = ""
oldList = ["one\ntwo\nthree", "four\nfive\nsix", "seven\neight\nnine"]
newList = ["four\nfifty\nsix", "seven\neight\nnine", "ten\neleven\ntwelve"]
a = oldList
b = newList
cruncher = difflib.SequenceMatcher(None, a, b)
for tag, alo, ahi, blo, bhi in cruncher.get_opcodes():
    if tag == 'replace':
        best_ratio, cutoff = 0.74, 0.75
        oldstrings = a[alo:ahi]
        newstrings = b[blo:bhi]
        for j in range(len(newstrings)):
            newstring = newstrings[j]
            cruncher.set_seq2(newstring)
            for i in range(len(oldstrings)):
                oldstring = oldstrings[i]
                cruncher.set_seq1(oldstring)
                if cruncher.real_quick_ratio() > best_ratio and \
                   cruncher.quick_ratio() > best_ratio and \
                   cruncher.ratio() > best_ratio:
                    best_ratio, best_old, best_new = cruncher.ratio(), i, j
            if best_ratio < cutoff:
                # added string
                stringLines = newstring.splitlines()
                for line in stringLines: diffString += "+" + line + "\n"
            else:
                # replaced string
                start = False
                for diff in difflib.unified_diff(oldstrings[best_old].splitlines(),
                                                 newstrings[best_new].splitlines()):
                    if start:
                        diffString += diff + "\n"
                    if diff[0:2] == '@@':  # the hunk header marks where the real diff lines start
                        start = True
                del oldstrings[best_old]
        # deleted strings
        stringLines = []
        for string in oldstrings:
            stringLines.extend(string.splitlines())
        for line in stringLines: diffString += "-" + line + "\n"
    elif tag == 'delete':
        stringLines = []
        for string in a[alo:ahi]:
            stringLines.extend(string.splitlines())
        for line in stringLines:
            diffString += "-" + line + "\n"
    elif tag == 'insert':
        stringLines = []
        for string in b[blo:bhi]:
            stringLines.extend(string.splitlines())
        for line in stringLines:
            diffString += "+" + line + "\n"
    elif tag == 'equal':
        continue
    else:
        raise ValueError('unknown tag %r' % (tag,))
which results in the following:
print(diffString)
 four
-five
+fifty
 six
-one
-two
-three
+ten
+eleven
+twelve
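What makes this work is that SequenceMatcher operates on any sequence of hashable elements, not just lines, so it can align whole multiline strings directly. A minimal sketch of that core idea (my illustration, not part of the answer above):
import difflib

# Each list element is one multiline block; SequenceMatcher aligns the blocks
# themselves, so the opcodes say which blocks were kept, dropped, or swapped.
old = ["one\ntwo\nthree", "four\nfive\nsix", "seven\neight\nnine"]
new = ["four\nfifty\nsix", "seven\neight\nnine", "ten\neleven\ntwelve"]

sm = difflib.SequenceMatcher(None, old, new)
for tag, alo, ahi, blo, bhi in sm.get_opcodes():
    print(tag, old[alo:ahi], '->', new[blo:bhi])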

Line split is not functioning as intended

I am trying to get this code to split each line once, but it is not functioning as expected:
for line in text_line:
    one_line = line.split(' ', 1)
    if len(one_line) > 1:
        acro = one_line[0].strip()
        meaning = one_line[1].strip()
        if acro in acronyms_dict:
            acronyms_dict[acro] = acronyms_dict[acro] + ', ' + meaning
        else:
            acronyms_dict[acro] = meaning
Remove the ' ' from the str.split call; the file uses tabs, not spaces, to delimit the acronyms:
import requests

data_site = requests.get(
    "https://raw.githubusercontent.com/priscian/nlp/master/OpenNLP/models/coref/acronyms.txt"
)
text_line = data_site.text.split("\n")

acronyms_dict = {}
for line in text_line:
    one_line = line.split(maxsplit=1)  # <-- remove the ' '
    if len(one_line) > 1:
        acro = one_line[0].strip()
        meaning = one_line[1].strip()
        if acro in acronyms_dict:
            acronyms_dict[acro] = acronyms_dict[acro] + ", " + meaning
        else:
            acronyms_dict[acro] = meaning

print(acronyms_dict)
Prints:
{
'24KHGE': '24 Karat Heavy Gold Electroplate',
'2B1Q': '2 Binary 1 Quaternary',
'2D': '2-Dimensional',
...
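The underlying difference: split(' ', 1) only splits on a literal space character, while split(maxsplit=1) splits on any run of whitespace, tabs included. A quick illustration (my example, with a made-up tab-delimited line):
line = "2D\t2-Dimensional"
print(line.split(' ', 1))      # ['2D\t2-Dimensional'] -- no space, so no split
print(line.split(maxsplit=1))  # ['2D', '2-Dimensional'] -- the tab counts as whitespace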

How would I format python code using python?

Let's say I've got this code in Python:
total=0for i in range(100):print(i)if i > 50:total=total+i
How would I write an algorithm in Python to format this into the code below:
total=0
for i in range(100):
    print(i)
    if i > 50:
        total=total+i
Assume that everything is nested under what comes before it, such that each following statement is assumed to be inside the if block.
This was quite a fun exercise! I'm running out of juice, so I'm just posting this as is. It works on your example but probably not on anything much more complex.
code_block = "total=0for i in range(100):print(i)if i > 50:total=total+iprint('finished')"
code_block_b = "def okay() {print('ff')while True:print('blbl')break}"

line_break_before = ['for', 'while', 'if', 'print', 'break', '}']
line_break_after = [':', '{']
indent_chars = [':', '{']
unindent_chars = ['}']

# Add line breaks before keywords
for kw in line_break_before:
    kw_indexes = [idx for idx in range(len(code_block)) if code_block[idx:idx + len(kw)] == kw]
    for kw_idx in kw_indexes[::-1]:
        code_block = code_block[:kw_idx] + '\n' + code_block[kw_idx:]

# Add line breaks after other keywords if not present already
for kw in line_break_after:
    kw_indexes = [idx for idx in range(len(code_block)) if code_block[idx:idx + len(kw)] == kw]
    for kw_idx in kw_indexes[::-1]:
        if code_block[kw_idx + 1: kw_idx + 2] != '\n':
            code_block = code_block[:kw_idx + 1] + '\n' + code_block[kw_idx + 1:]

# Add indentation
indent = 0
formatted_code_lines = []
for line in code_block.split('\n'):
    if line[-1] in unindent_chars:
        indent = 0
    formatted_code_lines.append(' ' * indent)
    if line[-1] in indent_chars:
        indent += 4
    formatted_code_lines.append(line + '\n')
code_block = ''.join(formatted_code_lines)
print(code_block)
The basic premise for formatting is based around keywords: there are keywords that require a line break before them, and keywords that require a line break after them. After that, indentation is increased by four spaces for every line following each : symbol. I tested some formatting with braces too, in code_block_b.
Output a
total=0
for i in range(100):
    print(i)
    if i > 50:
        total=total+i
        print('finished')
Output b
def okay() {
    print('ff')
    while True:
        print('blbl')
        break
}
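If you want to reuse this, a hedged refactor (my sketch; the naive_format name and its defaults are made up) wraps the three phases into one function:
def naive_format(code, break_before=('for', 'while', 'if', 'print', 'break', '}'),
                 break_after=(':', '{')):
    # phase 1: newline before selected keywords (naive substring match)
    for kw in break_before:
        idxs = [i for i in range(len(code)) if code[i:i + len(kw)] == kw]
        for i in idxs[::-1]:
            code = code[:i] + '\n' + code[i:]
    # phase 2: newline after ':' / '{' when one is not already there
    for kw in break_after:
        idxs = [i for i in range(len(code)) if code[i:i + len(kw)] == kw]
        for i in idxs[::-1]:
            if code[i + 1:i + 2] != '\n':
                code = code[:i + 1] + '\n' + code[i + 1:]
    # phase 3: grow the indent after each ':' or '{', reset it at '}'
    indent, out = 0, []
    for line in code.split('\n'):
        if line and line[-1] == '}':
            indent = 0
        out.append(' ' * indent + line)
        if line and line[-1] in ':{':
            indent += 4
    return '\n'.join(out)

print(naive_format("x=1if x:print(x)"))
# x=1
# if x:
#     print(x)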

How to add a space in hangman game

I'm trying to make a game where a song name is picked from a file and the title is replaced with underscores (apart from the first letter).
However, I'm not sure how to handle the spaces, since some songs are more than one word. This is what I have currently:
def QuizStart():
    line = random.choice(open('songnamefile.txt').readlines())
    line.split('-')
    songname, artist = line.split('-')
    underscoresong = songname
    i = 0
    song_name = range(1, len(songname))
    for i in song_name:
        if ' ' in song_name:
            i = i + 1
        else:
            underscoresong = underscoresong.replace(songname[i], "_")
            i = i + 1
    print(underscoresong, ' - ', artist)
It would be good to include the expected output for a given input in the question.
You can just multiply a list containing the placeholder character n times, e.g.:
songname = 'My blue submarine'
underscoresong = ''.join([songname[0]] + ['_'] * (len(songname) - 1))
print(underscoresong)
Output:
M________________
That keeps the first character and then repeats the underscore for the length of the song name minus one (the first character). The join converts the list back into a string.
Or if you want to preserve spaces:
underscoresong = ''.join(
    [songname[0]] + ['_' if c != ' ' else ' ' for c in songname[1:]]
)
print(underscoresong)
Output:
M_ ____ _________
Or if you want to also preserve the single quote:
songname = "God's Plan-Drake"
underscoresong = ''.join(
    [songname[0]] +
    ['_' if c not in {' ', "'"} else c for c in songname[1:]]
)
print(underscoresong)
Output:
G__'_ __________
You could also use regular expressions:
import re
songname = "God's Plan-Drake"
underscoresong = songname[0] + re.sub(r"[^ ']", '_', songname[1:])
print(underscoresong)
Output:
G__'_ __________
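To fold this back into the original function, a sketch along these lines could work (mine; it assumes one Song-Artist pair per line in songnamefile.txt, as in the question):
import random
import re

def quiz_start():
    line = random.choice(open('songnamefile.txt').readlines()).strip()
    songname, artist = line.split('-')
    # keep the first letter, spaces and apostrophes; hide everything else
    underscoresong = songname[0] + re.sub(r"[^ ']", '_', songname[1:])
    print(underscoresong, '-', artist)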

TF-IDF for my documents yields 0

I got this tf-idf script from yebrahim, and somehow my output document yields all 0 for the result. Is there a problem with this code?
An example of the output is:
hippo 0.0
hipper 0.0
hip 0.0
hint 0.0
hindsight 0.0
hill 0.0
hilarious 0.0
Thanks for the help.
# a list of (words-freq) pairs for each document
global_terms_in_doc = {}
# list to hold occurrences of terms across documents
global_term_freq = {}
num_docs = 0
lang = 'english'
lang_dictionary = {}
top_k = -1
supported_langs = ('english', 'french')

from django.utils.encoding import smart_str, smart_unicode

# support for custom language if needed
def loadLanguageLemmas(filePath):
    print('loading language from file: ' + filePath)
    f = open(filePath)
    for line in f:
        words = line.split()
        if words[1] == '=' or words[0] == words[1]:
            continue
        lang_dictionary[words[0]] = words[1]

def remove_diacritic(words):
    for i in range(len(words)):
        w = unicode(words[i], 'ISO-8859-1')
        w = unicodedata.normalize('NFKD', w).encode('ASCII', 'ignore')
        words[i] = w.lower()
    return words

# function to tokenize text, and put words back to their roots
def tokenize(text):
    text = ' '.join(text)
    tokens = PunktWordTokenizer().tokenize(text)
    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        #tokens[i] = tokens[i].strip("'")
        if lang != 'english':
            if tokens[i] in lang_dictionary:
                tokens[i] = lang_dictionary[tokens[i]]
        else:
            res = lmtzr.lemmatize(tokens[i])
            if res == tokens[i]:
                tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
            else:
                tokens[i] = res
    # don't return any single letters
    tokens = [t for t in tokens if len(t) > 1 and not t.isdigit()]
    return tokens

def remove_stopwords(text):
    # remove punctuation
    chars = ['.', '/', "'", '"', '?', '!', '#', '$', '%', '^', '&',
             '*', '(', ')', ' - ', '_', '+', '=', '#', ':', '\\', ',',
             ';', '~', '`', '<', '>', '|', '[', ']', '{', '}']
    for c in chars:
        text = smart_str(text.replace(c, ' '))
    text = text.split()
    import nltk
    if lang == 'english':
        stopwords = nltk.corpus.stopwords.words('english')
    else:
        stopwords = open(lang + '_stopwords.txt', 'r').read().split()
    content = [w for w in text if w.lower().strip() not in stopwords]
    return content

# __main__ execution
import sys, re, math, unicodedata
from optparse import OptionParser

parser = OptionParser(usage='usage: %prog [options] input_file')
parser.add_option('-l', '--language', dest='language',
                  help='language to use in tokenizing and lemmatizing. supported\
                  languages: {english, french}', metavar='LANGUAGE')
parser.add_option('-k', '--top-k', dest='top_k',
                  help='output only terms with score no less k')
parser.add_option('-m', '--mode', dest='mode',
                  help='display mode. can be either "both" or "term"')
(options, args) = parser.parse_args()

if options.language:
    if options.language not in supported_langs:
        print 'only ', supported_langs, ' are supported in this version.'
        quit()
    if options.language != 'english':
        lang = options.language
        loadLanguageLemmas(options.language + '_lemmas.txt')
if options.top_k:
    top_k = int(options.top_k)
display_mode = 'both'
if options.mode:
    if options.mode == 'both' or options.mode == 'term':
        display_mode = options.mode
    else:
        parser.print_help()
if not args:
    parser.print_help()
    quit()

reader = open(args[0])
all_files = reader.read().splitlines()
num_docs = len(all_files)

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.punkt import PunktWordTokenizer

print('initializing..')
for f in all_files:
    # local term frequency map
    terms_in_doc = {}
    doc_words = open(f).read().lower()
    #print 'words:\n', doc_words
    doc_words = remove_stopwords(doc_words)
    #print 'after stopwords:\n', doc_words
    doc_words = tokenize(doc_words)
    #print 'after tokenize:\n', doc_words
    #quit()
    # increment local count
    for word in doc_words:
        if word in terms_in_doc:
            terms_in_doc[word] += 1
        else:
            terms_in_doc[word] = 1
    # increment global frequency
    for (word, freq) in terms_in_doc.items():
        if word in global_term_freq:
            global_term_freq[word] += 1
        else:
            global_term_freq[word] = 1
    global_terms_in_doc[f] = terms_in_doc

print('working through documents.. ')
for f in all_files:
    writer = open(f + '_final', 'w')
    result = []
    # iterate over terms in f, calculate their tf-idf, put in new list
    max_freq = 0
    for (term, freq) in global_terms_in_doc[f].items():
        if freq > max_freq:
            max_freq = freq
    for (term, freq) in global_terms_in_doc[f].items():
        idf = math.log(float(1 + num_docs) / float(1 + global_term_freq[term]))
        tfidf = float(freq) / float(max_freq) * float(idf)
        result.append([tfidf, term])
    # sort result on tfidf and write them in descending order
    result = sorted(result, reverse=True)
    for (tfidf, term) in result[:top_k]:
        if display_mode == 'both':
            writer.write(term + '\t' + str(tfidf) + '\n')
        else:
            writer.write(term + '\n')

print('success, with ' + str(num_docs) + ' documents.')
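One thing worth checking (my note; no answer is captured in this thread): this script computes a smoothed idf of log((1 + num_docs) / (1 + document_frequency)), so any term that appears in every document gets idf = 0, and if the input list contains only one document, every term's tf-idf collapses to 0. A minimal illustration:
import math

num_docs = 1   # only one document listed in the input file
doc_freq = 1   # every term necessarily appears in that one document
idf = math.log(float(1 + num_docs) / float(1 + doc_freq))
print(idf)     # 0.0 -> every tf-idf score becomes 0.0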
