Python: find offsets of a word token in a text

I wrote this function, findTokenOffset, that finds the offsets of given words in a pre-tokenized text (a list of space-separated words, or tokens produced by some tokenizer).
import re, json

def word_regex_ascii(word):
    return r"\b{}\b".format(re.escape(word))

def findTokenOffset(text, tokens):
    seen = {}   # map of tokens that have been seen already
    items = []  # word tokens
    my_regex = word_regex_ascii
    # for each token word
    for index_word, word in enumerate(tokens):
        r = re.compile(my_regex(word), flags=re.I | re.X | re.UNICODE)
        item = {}
        # for each matched token in sentence
        for m in r.finditer(text):
            token = m.group()
            characterOffsetBegin = m.start()
            characterOffsetEnd = characterOffsetBegin + len(m.group()) - 1  # LP: start from 0
            found = -1
            if word in seen:
                found = seen[word]
            if characterOffsetBegin > found:
                # store the end offset of the last occurrence seen for this word
                seen[word] = characterOffsetEnd
                item['index'] = index_word + 1  # word index starts from 1
                item['word'] = token
                item['characterOffsetBegin'] = characterOffsetBegin
                item['characterOffsetEnd'] = characterOffsetEnd
                items.append(item)
                break
    return items
This code works OK when the tokens are single words, like:
text = "George Washington came to Washington"
tokens = text.split()
offsets = findTokenOffset(text,tokens)
print(json.dumps(offsets, indent=2))
But suppose the tokens are multi-word phrases, as here:
text = "George Washington came to Washington"
tokens = ["George Washington", "Washington"]
offsets = findTokenOffset(text,tokens)
print(json.dumps(offsets, indent=2))
the offsets come out wrong, because the same word repeats across different tokens:
[
  {
    "index": 1,
    "word": "George Washington",
    "characterOffsetBegin": 0,
    "characterOffsetEnd": 16
  },
  {
    "index": 2,
    "word": "Washington",
    "characterOffsetBegin": 7,
    "characterOffsetEnd": 16
  }
]
How can I add support for multi-token and overlapping token regex matching (thanks to the suggestions in the comments for this problem's exact name)?

If you do not need the search phrase/word index information in the resulting output, you can use the following approach:
import re, json

def findTokenOffset(text, pattern):
    items = []
    for m in pattern.finditer(text):
        item = {}
        item['word'] = m.group()
        item['characterOffsetBegin'] = m.start()
        item['characterOffsetEnd'] = m.end()
        items.append(item)
    return items

text = "George Washington came to Washington Washington.com"
tokens = ["George Washington", "Washington"]
pattern = re.compile(fr'(?<!\w)(?:{"|".join(sorted(map(re.escape, tokens), key=len, reverse=True))})(?!\w)(?!\.\b)', re.I)
offsets = findTokenOffset(text, pattern)
print(json.dumps(offsets, indent=2))
The output of the Python demo:
[
  {
    "word": "George Washington",
    "characterOffsetBegin": 0,
    "characterOffsetEnd": 17
  },
  {
    "word": "Washington",
    "characterOffsetBegin": 26,
    "characterOffsetEnd": 36
  }
]
The main part is pattern = re.compile(fr'(?<!\w)(?:{"|".join(sorted(map(re.escape, tokens), key=len, reverse=True))})(?!\w)(?!\.\b)', re.I), which does the following:
map(re.escape, tokens) - escapes special chars inside the token strings
sorted(..., key=len, reverse=True) - sorts the escaped tokens by length in descending order (so that, say, Washington Post gets tried before Washington)
"|".join(...) - creates an alternation of the tokens, token1|token2|etc.
(?<!\w)(?:...)(?!\w)(?!\.\b) - the final pattern that matches all the alternatives in tokens as whole words. (?<!\w) and (?!\w) are used to enable word boundary detection even if the tokens start/end with a special character.
NOTE ON WORD BOUNDARIES
You should check your token boundary requirements. I added (?!\.\b) since you mention that Washington should not match in Washington.com, so I inferred you want to fail any word match that is immediately followed by . and a word boundary. There are a lot of other possible solutions, the main one being whitespace boundaries, (?<!\S) and (?!\S).
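For example, a minimal sketch of the whitespace-boundary variant, reusing the findTokenOffset helper from the answer above (same sample text):

import re, json

text = "George Washington came to Washington Washington.com"
tokens = ["George Washington", "Washington"]
# Whitespace boundaries: the match may not touch any non-space character,
# so "Washington.com" is skipped entirely (the token is followed by ".").
alternation = "|".join(sorted(map(re.escape, tokens), key=len, reverse=True))
pattern = re.compile(fr"(?<!\S)(?:{alternation})(?!\S)", re.I)
offsets = findTokenOffset(text, pattern)
print(json.dumps(offsets, indent=2))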
Besides, see Match a whole word in a string using dynamic regex.

If you want to look up Washington but not George Washington, you can remove the phrases you have already found from the initial string. To do that, sort the tokens by word count, which gives you the opportunity to scan the multi-word phrases first, and the single words after.
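A minimal sketch of that idea (the helper name is made up): scan multi-word tokens first, and mask each found span so shorter tokens cannot re-match inside it, while still reporting offsets against the original string:

import re

def find_offsets_longest_first(text, tokens):
    items = []
    working = text
    # Scan tokens with more words first ("George Washington" before "Washington").
    for token in sorted(tokens, key=lambda t: len(t.split()), reverse=True):
        for m in re.finditer(r"\b{}\b".format(re.escape(token)), working):
            items.append({"word": token,
                          "characterOffsetBegin": m.start(),
                          "characterOffsetEnd": m.end()})
            # Mask the matched span with NUL characters so later (shorter) tokens
            # cannot match inside it; offsets in the original text are unchanged.
            working = working[:m.start()] + "\0" * (m.end() - m.start()) + working[m.end():]
    return sorted(items, key=lambda d: d["characterOffsetBegin"])

text = "George Washington came to Washington"
print(find_offsets_longest_first(text, ["George Washington", "Washington"]))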

Related

Unexpected behavior of SpaCy matcher with negation

Somehow I have trouble understanding the negation in SpaCy matchers. I tried this code:
import spacy
from spacy.matcher import Matcher
import json

nlp = spacy.load('en_core_web_sm')
#from spacy.tokenizer import Tokenizer
matcher = Matcher(nlp.vocab)
Sentence = "The cat is black"
negative_sentence = "The cat is not black"
test_pattern = '''
[
    [
        {"TEXT": "cat"},
        {"LEMMA": "be"},
        {"LOWER": "not", "OP": "!"},
        {"LOWER": "black"}
    ]
]
'''
db = json.loads(test_pattern)
matcher.add("TEST_PATTERNS", db)

'''*********************Validate matcher on positive sentence******************'''
doc = nlp(Sentence, matcher)
matches = matcher(doc)
if matches != []:
    print('Positive sentence identified')
else:
    print('Nothing found for positive sentence')

'''*********************Validate matcher on negative sentence******************'''
doc = nlp(negative_sentence, matcher)
matches = matcher(doc)
if matches != []:
    print('Negative sentence identified')
else:
    print('Nothing found for negative sentence')
The result is:
Nothing found for positive sentence
Nothing found for negative sentence
I would expect that the sentence "The cat is black" would be a match. Furthermore, when I replace the ! with any other operator ("*", "?", or "+") it works as expected:
import spacy
from spacy.matcher import Matcher
import json

nlp = spacy.load('en_core_web_sm')
#from spacy.tokenizer import Tokenizer
matcher = Matcher(nlp.vocab)
Sentence = "The cat is black"
negative_sentence = "The cat is not black"
test_pattern = '''
[
    [
        {"TEXT": "cat"},
        {"LEMMA": "be"},
        {"LOWER": "not", "OP": "?"},
        {"LOWER": "black"}
    ]
]
'''
db = json.loads(test_pattern)
matcher.add("TEST_PATTERNS", db)

'''*********************Validate matcher on positive sentence******************'''
doc = nlp(Sentence, matcher)
matches = matcher(doc)
if matches != []:
    print('Positive sentence identified')
else:
    print('Nothing found for positive sentence')

'''*********************Validate matcher on negative sentence******************'''
doc = nlp(negative_sentence, matcher)
matches = matcher(doc)
if matches != []:
    print('Negative sentence identified')
else:
    print('Nothing found for negative sentence')
Result:
Positive sentence identified
Negative sentence identified
How can I use negation to identify only "The cat is black" and not "The cat is not black"?
The reason I would like to use "OP" is that there might also be other words between "is" and "black" (e.g., "The cat is kind and black", but not "The cat is not kind and black").
Any help on understanding negation with SpaCy matchers is highly appreciated.
Each dictionary in your match pattern corresponds to a token by default. With the ! operator it still corresponds to one token, just in a negative sense. With the * operator it corresponds to zero or more tokens, with + it's one or more tokens.
Looking at your original pattern, these are your tokens:
text: cat
lemma: be
lower: not, op: !
lower: black
Given the sentence "The cat is black", the match process works like this:
"the" matches nothing so we skip it.
"cat" matches your first token.
"is" matches your second token.
"black" matches your third token because it is not "not"
The sentence ends, so there is no "black" token left for the fourth pattern entry, and the match fails.
When debugging patterns it's helpful to step through them like above.
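For instance, printing each token's attributes shows exactly what the pattern dicts (TEXT, LEMMA, LOWER) are compared against:

import spacy

nlp = spacy.load('en_core_web_sm')
for tok in nlp("The cat is not black"):
    print(tok.text, tok.lemma_, tok.lower_)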
For the other ops... * and ? work because "not" matches zero times. I would not expect + to work in the positive case.
The way you are trying to avoid matching negated things is kind of tricky. I would recommend you match all sentences with the relevant words first, ignoring negation, and then check if there is negation using the dependency parse.
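A minimal sketch of that two-step idea, assuming a relaxed pattern with a wildcard in the middle and a coarse sentence-level negation check (the pattern and helper name are made up for illustration):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)
# Match "cat ... be ... black" while ignoring negation entirely.
matcher.add("CAT_IS_BLACK", [[{"TEXT": "cat"}, {"LEMMA": "be"},
                              {"OP": "*"}, {"LOWER": "black"}]])

def is_positive_match(text):
    doc = nlp(text)
    if not matcher(doc):
        return False
    # Reject the sentence if the dependency parse contains a negation.
    return not any(tok.dep_ == "neg" for tok in doc)

print(is_positive_match("The cat is black"))      # True
print(is_positive_match("The cat is not black"))  # False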

Using .replace effectively on text

I'm attempting to capitalize all words in a section of text that only appear once. I have the bit that finds which words only appear once down, but when I go to replace the original word with the .upper version, a bunch of other stuff gets capitalized too. It's a small program, so here's the code.
from collections import Counter
from string import punctuation

path = input("Path to file: ")
with open(path) as f:
    word_counts = Counter(word.strip(punctuation) for line in f
                          for word in line.replace(")", " ").replace("(", " ")
                                          .replace(":", " ").replace("", " ").split())
wordlist = open(path).read().replace("\n", " ").replace(")", " ").replace("(", " ").replace("", " ")
unique = [word for word, count in word_counts.items() if count == 1]
for word in unique:
    print(word)
    wordlist = wordlist.replace(word, str(word.upper()))
print(wordlist)
The output should be 'Genesis 37:1 Jacob lived in the land of his father's SOJOURNINGS, in the land of Canaan.', as sojournings is the first word that only appears once. Instead, it outputs 'GenesIs 37:1 Jacob lIved In the land of hIs FATher's SOJOURNINGS, In the land of Canaan.' Because some of the unique words also occur as substrings inside other words, those occurrences get capitalized as well.
Any ideas?
I rewrote the code pretty significantly since some of the chained replace calls might prove to be unreliable.
import string

# The sentence.
sentence = "Genesis 37:1 Jacob lived in the land of his father's SOJOURNINGS, in the land of Canaan."

# Remove punctuation (Python 3: translate takes a mapping table).
rm_punc = sentence.translate(str.maketrans('', '', string.punctuation))
words = rm_punc.split(' ')  # split on spaces to get a list of words

# Find all unique word occurrences.
single_occurrences = []
for word in words:
    # if word only occurs 1 time, append it to the list
    if words.count(word) == 1:
        single_occurrences.append(word)

# For each unique word, find its index and capitalize the letter at that index
# in the initial string (the letter at that index is also the first letter of
# the word). Note that strings are immutable, so we are actually creating a new
# string on each iteration. Also, sometimes small words occur inside of other
# words, e.g. 'an' inside of 'land'. In order to make sure that our call to
# `index()` doesn't find these small words, we keep track of `start`, which
# makes sure we only ever search from the end of the previously found word.
start = 0
for word in single_occurrences:
    try:
        word_idx = start + sentence[start:].index(word)
    except ValueError:
        # Could not find word in sentence. Skip it.
        pass
    else:
        # Update counter.
        start = word_idx + len(word)
        # Rebuild sentence with capitalization.
        first_letter = sentence[word_idx].upper()
        sentence = sentence[:word_idx] + first_letter + sentence[word_idx+1:]

print(sentence)
Text replacement by patterns calls for regex.
Your text is a bit tricky; you have to
remove digits
remove punctuation
split into words
care about capitalization: 'It's' vs 'it's'
only replace full matches ('remote' vs 'mote' when replacing mote)
etc.
This should do it - see the comments inside for explanations:
bible.txt is from your link
from collections import Counter, defaultdict
from string import punctuation, digits
import re

with open(r"SO\AllThingsPython\P4\bible.txt") as f:
    s = f.read()

# get a set of unwanted characters and clean the text
ps = set(punctuation + digits)
s2 = ''.join(c for c in s if c not in ps)

# split into words
s3 = s2.split()

# create a set of all capitalizations of each word
repl = defaultdict(set)
for word in s3:
    repl[word.upper()].add(word)  # e.g. {..., 'IN': {'In', 'in'}, 'THE': {'The', 'the'}, ...}

# count all words _upper case_ and use those that only occur once
single_occurence_upper_words = [w for w, n in Counter((w.upper() for w in s3)).most_common() if n == 1]

text = s
# now the replace part - for all upper single words
for upp in single_occurence_upper_words:
    # for all occurring capitalizations in the text
    for orig in repl[upp]:
        # use regex replace to find the original word from our repl dict with
        # space/punctuation before/after it, and replace it with the uppercase word
        text = re.sub(f"(?<=[{punctuation} ])({orig})(?=[{punctuation} ])", upp, text)

print(text)
Output (shortened):
Genesis 37:1 Jacob lived in the land of his father's SOJOURNINGS, in the land of Canaan.
2 These are the GENERATIONS of Jacob.
Joseph, being seventeen years old, was pasturing the flock with his brothers. He was a boy with the sons of Bilhah and Zilpah, his father's wives. And Joseph brought a BAD report of them to their father. 3 Now Israel loved Joseph more than any other of his sons, because he was the son of his old age. And he made him a robe of many colors. [a] 4 But when his brothers saw that their father loved him more than all his brothers, they hated him
and could not speak PEACEFULLY to him.
<snip>
The regex uses lookahead '(?=...)' and lookbehind '(?<=...)' syntax to make sure we replace only full words, see regex syntax.
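As a toy illustration (made-up string) of why the lookarounds matter here: a pattern that consumes the boundary characters removes them from the text, while lookarounds only assert that they are present:

import re

s = "a mote in a remote place, mote!"
# Consuming the boundaries eats the surrounding spaces/punctuation:
print(re.sub(r"[ ,](mote)[ ,!]", "MOTE", s))  # aMOTEin a remote place,MOTE
# Lookarounds leave the boundaries in place:
print(re.sub(r"(?<=[ ,])mote(?=[ ,!])", "MOTE", s))  # a MOTE in a remote place, MOTE!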

Python RE: excluding some results

I'm new to RE and I'm trying to take song lyrics and isolate the verse titles, the backing vocals, and main vocals:
Here's an example of some lyrics:
[Intro]
D.A. got that dope!
[Chorus: Travis Scott]
Ice water, turned Atlantic (Freeze)
Nightcrawlin' in the Phantom (Skrrt, Skrrt)...
The verse titles include the square brackets and any words between them. They can be successfully isolated with
r'\[{1}.*?\]{1}'
The backing vocals are similar to the verse titles, but between (). They've been successfully isolated with:
r'\({1}.*?\){1}'
For the main vocals, I've used
r'\S+'
which does isolate the main_vocals, but also the verse titles and backing vocals. I cannot figure out how to isolate only the main vocals with simple REs.
Here's a python script that gets the output I desire, but I'd like to do it with REs (as a learning exercise) and cannot figure it out through documentation.
import re

file = 'D:/lyrics.txt'
with open(file, 'r') as f:
    lyrics = f.read()

def find_spans(pattern, string):
    pattern = re.compile(pattern)
    return [match.span() for match in pattern.finditer(string)]

verses = find_spans(r'\[{1}.*?\]{1}', lyrics)
backing_vocals = find_spans(r'\({1}.*?\){1}', lyrics)
main_vocals = find_spans(r'\S+', lyrics)

exclude = verses
exclude.extend(backing_vocals)
not_main_vocals = []
for span in exclude:
    start, stop = span
    not_main_vocals.extend(list(range(start, stop)))

main_vocals_temp = []
for span in main_vocals:
    append = True
    start, stop = span
    for i in range(start, stop):
        if i in not_main_vocals:
            append = False
            continue
    if append == True:
        main_vocals_temp.append(span)
main_vocals = main_vocals_temp
Try this Demo:
pattern = re.compile(r'(?P<Verse>\[[^\]]+])|(?P<Backing>\([^\)]+\))|(?P<Lyrics>[^\[\(]+)')
You can use re.finditer to isolate the groups.
breakdown = {k: [] for k in ('Verse', 'Backing', 'Lyrics')}
for p in pattern.finditer(song):
    for key, item in p.groupdict().items():
        if item: breakdown[key].append(item)
Result:
{
  'Verse': [
    '[Intro]',
    '[Chorus: Travis Scott]'
  ],
  'Backing': [
    '(Freeze)',
    '(Skrrt, Skrrt)'
  ],
  'Lyrics': [
    '\nD.A. got that dope!\n\n',
    '\nIce water, turned Atlantic ',
    "\nNightcrawlin' in the Phantom ",
    '...'
  ]
}
To elaborate a bit further on the pattern, it's using named groups to separate the three distinct groups. Using [^\]]+ and similar just means to find everything that is not ] (and likewise [^\)]+ means everything that is not )). In the Lyrics part we exclude anything that starts with [ and (. The link to the demo on regex101 explains the components in more detail if you need it.
If you don't care for the newlines in the main lyrics, use (?P<Lyrics>[^\[\(\n]+) (which also excludes \n) to get your Lyrics without newlines:
'Lyrics': [
  'D.A. got that dope!',
  'Ice water, turned Atlantic ',
  "Nightcrawlin' in the Phantom ",
  '...'
]
You could search for the text between close-brackets and open-brackets, using regex groups. If you have a single group (sub-pattern inside round-brackets) in your regex, re.findall will just return the contents of those brackets.
For example, "\[(.*?)\]" would find you just the section labels, not including the square brackets (since they're outside the group).
The regex "\)(.*?)\(" would find just the last line ("\nNightcrawlin' in the Phantom ").
Similarly, we could find the first line with "\](.*?)\[".
Combining the two types of brackets into a character class, the (significantly messier looking) regex "[\]\)](.*?)[\[\(]" captures all of the lyrics.
It will miss lines that don't have brackets before or after them (i.e. at the very start before [Intro], if there are any, or at the end if there are no backing vocals afterwards). A possible workaround is to prepend a "]" character and append a "[" character to force matches at the start and end of the string. Note we need to add the DOTALL option to make sure the wildcard "." will match the newline character "\n".
import re
lyrics = """[Intro]
D.A. got that dope!
[Chorus: Travis Scott]
Ice water, turned Atlantic (Freeze)
Nightcrawlin' in the Phantom (Skrrt, Skrrt)..."""
matches = re.findall(r"[\]\)](.*?)[\[\(]", "]" + lyrics + "[", re.DOTALL)
main_vocals = '\n'.join(matches)

How to add an if condition in re.sub in Python

I am using the following code to replace the strings in words with words[0] in the given sentences.
import re

sentences = ['industrial text minings', 'i love advanced data minings and text mining']
words = ["data mining", "advanced data mining", "data minings", "text mining"]

start_terms = sorted(words, key=lambda x: len(x), reverse=True)
start_re = "|".join(re.escape(item) for item in start_terms)

results = []
for sentence in sentences:
    for terms in words:
        if terms in sentence:
            result = re.sub(start_re, words[0], sentence)
            results.append(result)
            break
print(results)
My expected output is as follows:
['industrial text minings', 'i love data mining and data mining']
However, what I am getting is:
['industrial data minings', 'i love data mining and data mining']
In the first sentence, "text minings" is not in words. However, the words list contains "text mining", so the condition "text mining" in "industrial text minings" is True. After the replacement, "text mining" becomes "data mining", with the trailing 's' staying in place. I want to avoid such situations.
Therefore, I am wondering if there is a way to use an if condition in re.sub to check whether the next character is a space or not: if it is a space, do the replacement; otherwise, do not.
I am also happy with other solutions that could resolve my issue.
I modified your code a bit:
# Using Python 3.6.1
import re

sentences = ['industrial text minings and data minings and data',
             'i love advanced data mining and text mining as data mining has become a trend']
words = ["data mining", "advanced data mining", "data minings", "text mining", "data", 'text']

# Sort by length
start_terms = sorted(words, key=len, reverse=True)

results = []
# Loop through sentences
for sentence in sentences:
    # Loop through sorted words to replace
    result = sentence
    for term in start_terms:
        # Use exact word matching
        exact_regex = r'\b' + re.escape(term) + r'\b'
        # Replace matches with blank space (to avoid priority conflicts)
        result = re.sub(exact_regex, " ", result)
    # Replace inserted blank spaces with "data mining"
    blank_regex = r'^\s(?=\s)|(?<=\s)\s$|(?<=\s)\s(?=\s)'
    result = re.sub(blank_regex, words[0], result)
    results.append(result)

# Print sentences
print(results)
Output:
['industrial data mining minings and data mining and data mining', 'i love data mining and data mining as data mining has become a trend']
The regex can be a bit confusing so here's a quick breakdown:
\bword\b matches exact phrases/words since \b is a word boundary (more on that here)
^\s(?=\s) matches a space at the beginning followed by another space.
(?<=\s)\s$ matches a space at the end preceded by another space.
(?<=\s)\s(?=\s) matches a space with a space on both sides.
For more info on positive lookbehinds (?<=...) and positive lookaheads (?=...), see this Regex tutorial.
You can use a word boundary \b to surround your whole regex:
start_re = "\\b(?:" + "|".join(re.escape(item) for item in start_terms) + ")\\b"
Your regex will become something like:
\b(?:data mining|advanced data mining|data minings|text mining)\b
(?:) denotes a non-capturing group.
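For illustration, here is the boundary-wrapped pattern dropped into the code from the question (a sketch with the same sentences and words):

import re

sentences = ['industrial text minings', 'i love advanced data minings and text mining']
words = ["data mining", "advanced data mining", "data minings", "text mining"]

start_terms = sorted(words, key=len, reverse=True)
start_re = r"\b(?:" + "|".join(re.escape(item) for item in start_terms) + r")\b"

# "text minings" is left alone: there is no word boundary between "mining" and "s".
results = [re.sub(start_re, words[0], sentence) for sentence in sentences]
print(results)  # ['industrial text minings', 'i love data mining and data mining']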

python word grouping based on words before and after

I am trying to create groups of words. First I count all words. Then I establish the top 10 words by word count. Then I want to create 10 groups of words based on those top 10. Each group consists of all the words that appear before and after the top word.
I have survey results stored in a python pandas dataframe structured like this:
Question_ID | Customer_ID | Answer
          1 |         234 | Data is very important to use because ...
          2 |         234 | We value data since we need it ...
I also saved the answers column as a string.
I am using the following code to find 3 words before and after a word (I actually had to create a string out of the answers column):
answers_str = df.Answer.apply(str)
for value in answers_str:
    non_data = re.split('data|Data', value)
    terms_list = [term for term in non_data if len(term) > 0]  # skip empty terms
    substrs = [term.split()[0:3] for term in terms_list]  # slice and grab first three terms
    result = [' '.join(term) for term in substrs]  # combine the terms back into substrings
    print(result)
I have been manually creating groups of words - but is there a way of doing it in python?
So based on the example shown above the group with word counts would look like this:
group "data":
data : 2
important: 1
value: 1
need:1
then when it goes through the whole file, there would be another group:
group "analytics:
analyze: 5
report: 7
list: 10
visualize: 16
The idea would be to get rid of "we", "to", "is" as well - but I can do that manually, if that's not possible.
Then I want to establish the 10 most used words (by word count) and create 10 groups of the words that appear in front of and behind those top 10 words.
We can use regex for this. We'll be using this regular expression
((?:\b\w+?\b\s*){0,3})[dD]ata((?:\s*\b\w+?\b){0,3})
which you can test for yourself here, to extract the three words before and after each occurrence of data.
First, let's remove all the words we don't like from the strings.
import re

# If you're processing a lot of sentences, it's probably wise to preprocess
# the pattern, assuming that bad_words is the same for all sentences
def remove_words(sentence, bad_words):
    pat = r'(?:{})'.format(r'|'.join(bad_words))
    return re.sub(pat, '', sentence, flags=re.IGNORECASE)
Then we want to get the words that surround data in each line:
data_pat = r'((?:\b\w+?\b\s*){0,3})[dD]ata((?:\s*\b\w+?\b){0,3})'
res = re.findall(data_pat, s, flags=re.IGNORECASE)
gives us a list of tuples of strings. We want to get a list of those strings after they are split.
from itertools import chain
list_of_words = list(chain.from_iterable(map(str.split, chain.from_iterable(map(chain, chain(res))))))
That's not pretty, but it works. Basically, we pull the tuples out of the list, pull the strings out of each tuple, split each string, and then pull all the resulting words into one big list.
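If you prefer, an equivalent nested comprehension (a sketch of the same flattening) may read more clearly:

# res is a list of (before, after) tuples; split each side into words.
list_of_words = [word
                 for before, after in res
                 for side in (before, after)
                 for word in side.split()]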
Let's put this all together with your pandas code. pandas isn't my strongest area, so please don't assume that I haven't made some elementary mistake if you see something weird looking.
import re
from itertools import chain
from collections import Counter

def remove_words(sentence, bad_words):
    pat = r'(?:{})'.format(r'|'.join(bad_words))
    return re.sub(pat, '', sentence, flags=re.IGNORECASE)

bad_words = ['we', 'is', 'to']
sentence_list = df.Answer.apply(lambda x: remove_words(str(x), bad_words))

c = Counter()
data_pat = r'((?:\b\w+?\b\s*){0,3})data((?:\s*\b\w+?\b){0,3})'
for sentence in sentence_list:
    res = re.findall(data_pat, sentence, flags=re.IGNORECASE)
    words = chain.from_iterable(map(str.split, chain.from_iterable(map(chain, chain(res)))))
    c.update(words)
The nice thing about the regex we're using is that all of the complicated parts don't care about what word we're using. With a slight change, we can make a format string
base_pat = r'((?:\b\w+?\b\s*){{0,3}}){}((?:\s*\b\w+?\b){{0,3}})'
such that
base_pat.format('data') == data_pat
So, with some list of words we want to collect information about, key_words:
import re
from itertools import chain
from collections import Counter

def remove_words(sentence, bad_words):
    pat = r'(?:{})'.format(r'|'.join(bad_words))
    return re.sub(pat, '', sentence, flags=re.IGNORECASE)

bad_words = ['we', 'is', 'to']
sentence_list = df.Answer.apply(lambda x: remove_words(str(x), bad_words))

key_words = ['data', 'analytics']
d = {}
base_pat = r'((?:\b\w+?\b\s*){{0,3}}){}((?:\s*\b\w+?\b){{0,3}})'
for keyword in key_words:
    key_pat = base_pat.format(keyword)
    c = Counter()
    for sentence in sentence_list:
        res = re.findall(key_pat, sentence, flags=re.IGNORECASE)
        words = chain.from_iterable(map(str.split, chain.from_iterable(map(chain, chain(res)))))
        c.update(words)
    d[keyword] = c
Now we have a dictionary d that maps keywords, like data and analytics, to Counters that map words that are not on our blacklist to their counts in the vicinity of the associated keyword. Something like
d = {'data': Counter({'important': 2,
                      'very': 3}),
     'analytics': Counter({'boring': 5,
                           'sleep': 3})}
As to how we get the top 10 words, that's basically the thing Counter is best at.
key_words, _ = zip(*Counter(w for sentence in sentence_list for w in sentence.split()).most_common(10))
