Get the word around a given position - python

Let
s = 'hello you blablablbalba qyosud'
i = 17
How to get the word around position i? i.e. blablablbalba in my example.
I was thinking about this, but it seems unpythonic:
for j, c in enumerate(s):
if c == ' ':
if j < i:
start = j
else:
end = j
break
print start, end
print s[start+1:end]

Here is another simple approach with regex,
import re
s = 'hello you blablablbalba qyosud'
i = 17
# Walk over every word together with its span and keep the one whose span
# contains index i.  Selecting by position (instead of searching the whole
# string for the fragment found at s[i:]) avoids also returning other words
# that merely contain the same fragment.  The result is None when s[i] is not
# a word character.
word_at_i = next(
    (m.group() for m in re.finditer(r"\w+", s) if m.start() <= i < m.end()),
    None,
)
print(word_at_i)
Update: the previous pattern failed when the text at position i included a space; the current pattern handles that case.

To answer your first question,
p = s[0 : i].rfind(' ')
Output: 9
For your second question,
s[ p + 1 : (s[p + 1 : ].find(' ') + p + 1) ]
Output: 'blablablbalba'
Description:
Extract the string from the starting to the ith position.
Find the index of the last occurrence of space. This will be your starting point for your required word (the second question).
Go from here to the next occurrence of space and extract the word in between.
The following consolidated code should work in all scenarios:
s = s + ' '
p = s[0 : i].rfind(' ')
s[ p + 1 : (s[p + 1 : ].find(' ') + p + 1) ]

You can split the string on spaces and then count the number of spaces that occur before position i; that count is the index of the wanted word in the split list.
Solution:
print (s.split()[s[:i].count(" ")])
EDIT:
If we have more than one space between words and we want to consider two spaces (or more) as one space we can do:
print (s.split()[" ".join(s[:i].split()).count(" ")])
Output:
blablablbalba
Explanation:
This returns 2, as there are two spaces before index 17.
s[:i].count(" ") # return's 2
This returns the list of words obtained by splitting on whitespace.
s.split()
What you need is the index of the relevant item, which you got from s[:i].count(" ")
['hello', 'you', 'blablablbalba', 'qyosud']

def func(s, i):
    """Return the space-delimited word of ``s`` that covers index ``i``.

    The word starts just after the last space found before index ``i`` (or
    at the beginning of ``s``) and runs up to the next space, or to the end
    of ``s`` when no further space exists.

    Args:
        s: The string to search.
        i: An index inside the wanted word.

    Returns:
        The word as a string.
    """
    # rfind returns -1 when there is no earlier space, so "+ 1" conveniently
    # yields 0, the start of the string.
    start = s.rfind(' ', 0, i) + 1
    end = s.find(' ', start)
    if end == -1:
        # Last word of the string: there is no trailing space to stop at.
        end = len(s)
    return s[start:end]
s = 'hello you blablablbalba qyosud'
i = 17
k = func(s, i)
print(k)
output:
blablablbalba

You can use index or find to get the index of the space starting from a precise position. In this case it will look for the space character position starting from start+1. Then, if it finds any space it will print out the word between the two indexes start and end
s = 'hello you blablablbalba qyosud'


def get_word(my_string, start_index):
    """Return the text of ``my_string`` from ``start_index`` to the next space.

    Args:
        my_string: The string to read from.
        start_index: Index of the first character of the wanted word.

    Returns:
        The characters from ``start_index`` up to (not including) the next
        space, or up to the end of the string when no further space exists.
    """
    # str.find signals "not found" with -1 rather than raising, so the result
    # has to be checked explicitly.
    end = my_string.find(' ', start_index + 1)
    if end == -1:
        return my_string[start_index:]
    return my_string[start_index:end]


# Index 10 is the first character of the third word.
print(get_word(s, 10))
Output: 'blablablbalba'

You can use rfind to search for the previous whitespace including s[i]:
>>> s = 'hello you blablablbalba qyosud'
>>> i = 17
>>> start = s.rfind(' ', 0, i + 1)
>>> start
9
Then you can use find to search the following whitespace again including s[i]:
>>> end = s.find(' ', i)
>>> end
23
And finally use slice to generate the word:
>>> s[start+1:(end if end != -1 else None)]
'blablablbalba'
The above yields the word when s[i] is not whitespace. When s[i] is whitespace, the result is an empty string.

Related

Find the start and end position of a word in a string based on the index position of that word from a label list

I have a sentence
str = 'cold weather gives me cold'
and a list
tag = ['O','O','O','O','disease']
This indicates that 5th word in the sentence is a disease type. Now I need to get the starting and ending position of the 5th word.
If I just do string search with 'cold' it will give me the starting position of the "cold" which occurs first.
This should do it.
def get(str, target_index):
    """Return ``(start, end)`` of the word at ``target_index`` in ``str``.

    Words are assumed to be separated by exactly one space.  ``end`` is
    exclusive, so ``str[start:end]`` is the word; periods inside the word are
    not counted towards its length.

    Args:
        str: The sentence (the name shadows the builtin inside this function
            only; it is kept so existing keyword callers continue to work).
        target_index: Zero-based index of the wanted word.

    Returns:
        A ``(start, end)`` tuple of character offsets.

    Raises:
        IndexError: If the sentence has no word at ``target_index``.
    """
    words = str.split(" ")
    # Every preceding word contributes its length plus one separating space.
    # For the first word this sum is 0, so no spurious leading offset appears.
    start = sum(len(word) + 1 for word in words[:target_index])
    end = start + len(words[target_index].replace('.', ''))
    return (start, end)
str = 'cold weather gives me cold.'
tag = ['O','O','O','O','disease']
start,end = get(str,tag.index('disease'))
print(start,end,str[start:end]) # outputs 22 26 cold
str = 'cold weather gives me cold'
tag = ['O','O','O','O','disease']
start,end = get(str,tag.index('disease'))
print(start,end,str[start:end]) # outputs 22 26 cold
str = 'cold weather gives me cold and cough'
tag = ['O','O','O','O','disease']
start,end = get(str,tag.index('disease'))
print(start,end,str[start:end]) # outputs 22 26 cold
See it in action here.
Hope it helps!
First find the disease index from tag then disease name from data then find start and end index:
strData = 'cold weather gives me cold'
tag = ['O', 'O', 'O', 'O', 'disease']

# Position of the word labelled as a disease, and the word itself.
diseaseIndex = tag.index('disease')
words = strData.split()
diseaseName = words[diseaseIndex]
print(diseaseName)

# Characters in front of the word: the lengths of all earlier words plus one
# separator for each of them.
precedingLetters = sum(map(len, words[:diseaseIndex]))
diseaseNameStartIndex = precedingLetters + diseaseIndex
# The end index is inclusive, hence the -1.
diseaseNameEndIndex = diseaseNameStartIndex + len(diseaseName) - 1
print("diseaseNameStartIndex = ", diseaseNameStartIndex)
print("diseaseNameEndIndex = ", diseaseNameEndIndex)
output:
cold
diseaseNameStartIndex = 22
diseaseNameEndIndex = 25
You could simply split the string and then join it again, but this is somewhat awkward.
string_list = string.split(" ")
word_start = len(" ".join(string_list[:4])) + 1
word_end = word_start + len(string_list[4])
The following will output the start and end position of a given word, assuming words are separated by spaces:
# Named "sentence" rather than "str" so the builtin str type stays usable.
sentence = 'cold weather gives me cold'
word_idx = 4  # index of the word we are looking for
words = sentence.split(' ')
print(words[word_idx])  # outputs 'cold'
# Each preceding word contributes its own length plus one separating space.
start_pos = sum(len(word) + 1 for word in words[:word_idx])
# Inclusive end position of the word.
end_pos = start_pos + len(words[word_idx]) - 1
print(start_pos)  # prints 22
print(end_pos)  # prints 25
Using itertools and re:
import re
from itertools import accumulate
def find_index(string, n):
    """Return the inclusive ``(start, end)`` indices of word ``n`` in ``string``.

    Words are maximal runs of non-whitespace characters.

    Args:
        string: The text to search.
        n: Zero-based index of the wanted word.

    Returns:
        A ``(start, end)`` tuple where ``end`` is the index of the word's
        last character.

    Raises:
        IndexError: If ``string`` has no word at index ``n``.
    """
    # finditer reports the real span of each word, so runs of several
    # whitespace characters or leading whitespace cannot skew the offsets.
    spans = [match.span() for match in re.finditer(r'\S+', string)]
    start, end = spans[n]
    return start, end - 1
Using it:
find_index('cold weather gives me cold', 4) #5th word means 4 in indexing
Output:
(22, 25)
Try using this function:
def find_index(s, n):
    """Return the inclusive ``(start, end)`` indices of word ``n`` in ``s``.

    Args:
        s: The text to search; words are separated by whitespace.
        n: Zero-based index of the wanted word.

    Returns:
        A ``(start, end)`` tuple where ``end`` is the index of the word's
        last character.

    Raises:
        IndexError: If ``s`` has no word at index ``n``.
    """
    spans = []
    position = 0  # where to resume searching, just past the previous word
    for word in s.split():
        # Locate the word in the original string so that the separators
        # between words are accounted for in the offsets.
        start = s.index(word, position)
        position = start + len(word)
        spans.append((start, position - 1))
    return spans[n]
print(find_index('cold weather gives me cold', 4))
Output:
(22, 25)
If you have to do this for a long line, it is better to use an iterator, which would generate the word starting and ending position using re.finditer method, and then find the nth element of the iterator using islice
>>> str = 'cold weather gives me cold'
>>> word_pos = iter((match.group(), match.span(1)) for match in re.finditer(r'(\S+)\S', string))
>>>
>>> n=4
>>> next(islice(word_pos, n, n+1))
('cold', (22, 25))
You can use re with a list comprehension:
import re
s = 'cold weather gives me cold'
tag = ['O', 'O', 'O', 'O', 'disease']
# Pair each label with the regex match of the corresponding word.  The match
# object carries the word's true offsets, so punctuation and repeated
# whitespace between words cannot shift the reported positions.
result = [
    [match.start(), match.end()]
    for label, match in zip(tag, re.finditer(r'\w+', s))
    if label == 'disease'
]
Output:
[[22, 26]]

How to copy spaces from one string to another in Python?

I need a way to copy all of the positions of the spaces of one string to another string that has no spaces.
For example:
string1 = "This is a piece of text"
string2 = "ESTDTDLATPNPZQEPIE"
output = "ESTD TD L ATPNP ZQ EPIE"
Insert characters as appropriate into a placeholder list and concatenate it after using str.join.
it = iter(string2)
output = ''.join(
[next(it) if not c.isspace() else ' ' for c in string1]
)
print(output)
'ESTD TD L ATPNP ZQ EPIE'
This is efficient as it avoids repeated string concatenation.
You need to iterate over the indexes and characters in string1 using enumerate().
On each iteration, if the character is a space, add a space to the output string (note that this is inefficient as you are creating a new object as strings are immutable), otherwise add the character in string2 at that index to the output string.
So that code would look like:
output = ''
si = 0
for i, c in enumerate(string1):
if c == ' ':
si += 1
output += ' '
else:
output += string2[i - si]
However, it would be more efficient to use a very similar method, but with a generator and then str.join. This removes the slow concatenations to the output string:
def chars(s1, s2):
    """Yield the characters of ``s2`` laid out with the spacing of ``s1``.

    For every character of ``s1`` one character is produced: a space where
    ``s1`` has a space, otherwise the next not-yet-used character of ``s2``.

    Raises:
        IndexError: If ``s2`` has fewer characters than ``s1`` has
            non-space characters.
    """
    consumed = 0  # how many characters of s2 have been emitted so far
    for template_char in s1:
        if template_char != ' ':
            yield s2[consumed]
            consumed += 1
        else:
            yield ' '
# The generator is named "chars"; join its output in a single pass.
output = ''.join(chars(string1, string2))
You can try insert method :
string1 = "This is a piece of text"
string2 = "ESTDTDLATPNPZQEPIE"

# Work on a mutable list of characters so spaces can be inserted in place.
string3 = list(string2)
space_positions = [pos for pos, ch in enumerate(string1) if ch == ' ']
# Ascending order matters: every insert shifts the later characters right, so
# each following position is already expressed in final-string coordinates.
for pos in space_positions:
    string3.insert(pos, ' ')
print("".join(string3))
output:
ESTD TD L ATPNP ZQ EPIE

Python: replace string, matched from a list

Trying to match and mark character based n-grams. The string
txt = "how does this work"
is to be matched with n-grams from the list
ngrams = ["ow ", "his", "s w"]
and marked with <> – however, only if there is no preceding opened quote. The output i am seeking for this string is h<ow >does t<his w>ork (notice the double match in the 2-nd part, but within just 1 pair of expected quotes).
The for loop i’ve tried for this doesn’t, however, produce the wanted output at all:
switch = False
for i in txt:
if i in "".join(ngrams) and switch == False:
txt = txt.replace(i, "<" + i)
switch = True
if i not in "".join(ngrams) and switch == True:
txt = txt.replace(i, ">" + i)
switch = False
print(txt)
Any help would be greatly appreciated.
This solution uses the str.find method to find all copies of an ngram within the txt string, saving the indices of each copy to the indices set so we can easily handle overlapping matches.
We then copy txt, character by character, to the result list, inserting angle brackets where required. This strategy is more efficient than inserting the angle brackets using multiple .replace calls, because each .replace call needs to rebuild the whole string.
I've extended your data slightly to illustrate that my code handles multiple copies of an ngram.
# Demo: wrap every stretch of txt that is covered by an ngram in <...>.
txt = "how does this work now chisolm"
ngrams = ["ow ", "his", "s w"]
print(txt)
print(ngrams)
# Search for all copies of each ngram in txt
# saving the indices where the ngrams occur
indices = set()
for s in ngrams:
    slen = len(s)
    lo = 0
    while True:
        i = txt.find(s, lo)
        if i == -1:
            break
        # Resume the next search just past this copy.
        lo = i + slen
        print(s, i)
        # Record every index of this copy except its last one: the copy loop
        # below writes the closing bracket after the first character whose
        # index is not recorded, which is the copy's final character.
        indices.update(range(i, lo-1))
print(indices)
# Copy the txt to result, inserting angle brackets
# to show matches
# switch is True while outside a bracketed group and False while inside one.
switch = True
result = []
for i, u in enumerate(txt):
    if switch:
        if i in indices:
            result.append('<')
            switch = False
        result.append(u)
    else:
        result.append(u)
        if i not in indices:
            result.append('>')
            switch = True
print(''.join(result))
output
how does this work now chisolm
['ow ', 'his', 's w']
ow 1
ow 20
his 10
his 24
s w 12
{1, 2, 10, 11, 12, 13, 20, 21, 24, 25}
h<ow >does t<his w>ork n<ow >c<his>olm
If you want adjacent groups to be merged, we can easily do that using the str.replace method. But to make that work properly we need to pre-process the original data, converting all runs of whitespace to single spaces. A simple way to do that is to split the data and re-join it.
# Demo: as above, but adjacent bracketed groups are merged at the end.
txt = "how does this\nwork now chisolm hisow"
ngrams = ["ow", "his", "work"]
# Convert all whitespace to single spaces, so that the '> <' replacement at
# the end can recognise groups separated by one space.
txt = ' '.join(txt.split())
print(txt)
print(ngrams)
# Search for all copies of each ngram in txt
# saving the indices where the ngrams occur
indices = set()
for s in ngrams:
    slen = len(s)
    lo = 0
    while True:
        i = txt.find(s, lo)
        if i == -1:
            break
        # Resume the next search just past this copy.
        lo = i + slen
        print(s, i)
        # Record every index of this copy except its last one: the copy loop
        # below writes the closing bracket after the first character whose
        # index is not recorded, which is the copy's final character.
        indices.update(range(i, lo-1))
print(indices)
# Copy the txt to result, inserting angle brackets
# to show matches
# switch is True while outside a bracketed group and False while inside one.
switch = True
result = []
for i, u in enumerate(txt):
    if switch:
        if i in indices:
            result.append('<')
            switch = False
        result.append(u)
    else:
        result.append(u)
        if i not in indices:
            result.append('>')
            switch = True
# Convert the list to a single string
output = ''.join(result)
# Merge adjacent groups: '> <' joins groups separated by a single space,
# '><' joins groups that touch directly.
output = output.replace('> <', ' ').replace('><', '')
print(output)
output
how does this work now chisolm hisow
['ow', 'his', 'work']
ow 1
ow 20
ow 34
his 10
his 24
his 31
work 14
{32, 1, 34, 10, 11, 14, 15, 16, 20, 24, 25, 31}
h<ow> does t<his work> n<ow> c<his>olm <hisow>
This should work:
def mark_ngrams(txt, ngrams):
    """Return ``txt`` with every stretch covered by an n-gram wrapped in ``<>``.

    Overlapping or touching occurrences share a single pair of brackets.

    Args:
        txt: The text to annotate.
        ngrams: Iterable of substrings to look for.

    Returns:
        The annotated string.
    """
    length = len(txt)
    # matched[k] is True when character k lies inside some n-gram occurrence.
    matched = [False] * length
    for ngram in ngrams:
        size = len(ngram)
        # "+ 1" so that an occurrence ending exactly at the end of txt is
        # still examined.
        for start in range(length - size + 1):
            if txt.startswith(ngram, start):
                matched[start:start + size] = [True] * size

    # Sandwich each run of matched characters between brackets.
    out = []
    inside = False
    for char, is_match in zip(txt, matched):
        if is_match != inside:
            out.append('<' if is_match else '>')
            inside = is_match
        out.append(char)
    if inside:
        # A run that reaches the end of txt still needs its closing bracket.
        out.append('>')
    return "".join(out)


txt = "how does this work"
ngrams = ["ow ", "his", "s w"]
print(mark_ngrams(txt, ngrams))
Here's a method with only one for loop. I timed it and it's about as fast as the other answers to this question. I think it's a bit more clear, although that might be because I wrote it.
I iterate over the index of the first character in the n-gram, then if it matches, I use a bunch of if-else clauses to see whether I should add a < or > in this situation. I add to the end of the string output from the original txt, so I'm not really inserting in the middle of a string.
def bracket_ngrams(txt, ngrams, n):
    """Return ``txt`` with stretches covered by n-grams wrapped in ``<>``.

    A single left-to-right pass looks at every window of ``n`` characters.
    Windows that overlap or touch the previous matching window extend the
    currently open group instead of starting a new one.

    Args:
        txt: The text to annotate.
        ngrams: Container of n-grams, all of length ``n``.
        n: Length of every n-gram.

    Returns:
        The annotated string; ``txt`` unchanged when nothing matches.
    """
    pieces = []
    emitted = 0      # txt[:emitted] has already been copied into pieces
    group_end = -1   # index just past the open group, -1 before any match
    for i in range(len(txt) - n + 1):
        if txt[i:i + n] not in ngrams:
            continue
        if i > group_end:
            # Gap since the previous match: close that group (if there was
            # one), copy the unmatched text and open a new group.
            if group_end >= 0:
                pieces.append(txt[emitted:group_end] + '>')
                emitted = group_end
            pieces.append(txt[emitted:i] + '<')
            emitted = i
        group_end = i + n
    if group_end >= 0:
        pieces.append(txt[emitted:group_end] + '>')
        emitted = group_end
    pieces.append(txt[emitted:])
    return ''.join(pieces)


txt = "how does this work"
ngrams = {"ow ", "his", "s w"}
n = 3
output = bracket_ngrams(txt, ngrams, n)
print(output)

Python string split join 4

import re
string = "is2 Thi1s T4est 3a"
def order(sentence):
res = ''
count = 1
list = sentence.split()
for i in list:
for i in list:
a = re.findall('\d+', i)
if a == [str(count)]:
res += " ".join(i)
count += 1
print(res)
order(string)
Above there is a code which I have problem with. Output which I should get is:
"Thi1s is2 3a T4est"
Instead I'm getting the correct order but with spaces in the wrong places:
"T h i 1 si s 23 aT 4 e s t"
Any idea how to make it work with this code concept?
You are joining the characters of each word:
>>> " ".join('Thi1s')
'T h i 1 s'
You want to collect your words into a list and join that instead:
def order(sentence):
    """Print the words of ``sentence`` ordered by the number inside each word.

    Words are expected to be numbered consecutively starting at 1; a word is
    picked when the digit runs it contains are exactly the next expected
    number.

    Args:
        sentence: Whitespace-separated words, each containing its position.
    """
    number_words = []
    count = 1
    words = sentence.split()
    # One full scan of the words per position: each scan can pick up the word
    # carrying the next expected number wherever it sits in the sentence.
    for _ in words:
        for word in words:
            matches = re.findall(r'\d+', word)
            if matches == [str(count)]:
                number_words.append(word)
                count += 1
    # Join whole words (not the characters of each word) with single spaces.
    result = ' '.join(number_words)
    print(result)
I used more verbose and clear variable names. I also removed the list variable; don't use list as a variable name if you can avoid it, as that masks the built-in list name.
What you implemented comes down to a O(N^2) (quadratic time) sort. You could instead use the built-in sort() function to bring this to O(NlogN); you'd extract the digit and sort on its integer value:
def order(sentence):
    """Return the words of ``sentence`` sorted by their first embedded number.

    Words without any digit make the lookup fail, because the regex search
    returns ``None`` for them.
    """
    number_pattern = re.compile(r'\d+')

    def embedded_number(word):
        # Integer value of the first run of digits in the word.
        return int(number_pattern.search(word).group())

    ordered = sentence.split()
    ordered.sort(key=embedded_number)
    return ' '.join(ordered)
This differs a little from your version in that it'll only look at the first (consecutive) digits, it doesn't care about the numbers being sequential, and will break for words without digits. It also uses a return to give the result to the caller rather than print. Just use print(order(string)) to print the return value.
If you assume the words are numbered consecutively starting at 1, then you can sort them in O(N) time even:
def order(sentence):
    """Return the words of ``sentence`` placed at the slot their number names.

    Each word's first run of digits is read as its 1-based position, and the
    word is written straight into that slot of a pre-sized list.
    """
    number_pattern = re.compile(r'\d+')
    tokens = sentence.split()
    slots = [None] * len(tokens)
    for token in tokens:
        # Convert the 1-based number in the word to a 0-based list index.
        position = int(number_pattern.search(token).group()) - 1
        slots[position] = token
    return ' '.join(slots)
This works by creating a list of the same length, then using the digits from each word to put the word into the correct index (minus 1, as Python lists start at 0, not 1).
I think the bug is simply in the misuse of join(). You want to concatenate the current sorted string. i is simply a token, hence simply add it to the end of the string. Code untested.
import re
string = "is2 Thi1s T4est 3a"
def order(sentence):
    """Print the words of ``sentence`` ordered by the number inside each word.

    Words are expected to be numbered consecutively starting at 1.

    Args:
        sentence: Whitespace-separated words, each containing its position.
    """
    res = ''
    count = 1
    words = sentence.split()
    # One full scan of the words per position, so the word carrying the next
    # expected number is found wherever it sits in the sentence.
    for _ in words:
        for word in words:
            if re.findall(r'\d+', word) == [str(count)]:
                # Append the whole word (str.join on a word would space out
                # its characters); add the separator only between words so
                # the result has no leading space.
                res = res + " " + word if res else word
                count += 1
    print(res)
order(string)

How to find and get rid of consecutive repeated punctuation signs without using regular expressions in python?

I want to get rid of repeated consecutive punctuation signs and only leave one of them.
If I have
string = 'Is it raining????',
I want to get
string = 'Is it raining?'
But I don't want to get rid of '...'
I also need to do this without using regular expressions. I am a beginner in python and would appreciate any advice or hint. Thanks :)
Yet another groupby approach:
from itertools import groupby
from string import punctuation
# Every punctuation character except the period is subject to collapsing.
punc = set(punctuation).difference('.')
s = 'Thisss is ... a test!!! string,,,,, with 1234445556667 rrrrepeats????'
print(s)
# groupby yields one (character, run) pair per run of equal characters: keep a
# single character for runs of collapsible punctuation, the full run otherwise.
newtext = [
    ch
    for key, run in groupby(s)
    for ch in (key if key in punc else run)
]
print(''.join(newtext))
output
Thisss is ... a test!!! string,,,,, with 1234445556667 rrrrepeats????
Thisss is ... a test! string, with 1234445556667 rrrrepeats?
import string
from itertools import groupby
# get all punctuation minus period.
# get all punctuation minus period.
puncs = set(string.punctuation) - set('.')
s = 'Is it raining???? No but...,,,, it is snowing!!!!!!!###!######'
# groupby yields one (character, run) pair per run of equal characters.  A run
# of collapsible punctuation is reduced to a single character; every other run
# is kept in full.  Joining once avoids repeated string concatenation.
s = ''.join(
    char if char in puncs else ''.join(run)
    for char, run in groupby(s)
)
print(s)
# Is it raining? No but..., it is snowing!#!#
#Is it raining? No but..., it is snowing!#!###
How about the following kind of approach:
import string
text = 'Is it raining???? No,,,, but...,,,, it is snoooowing!!!!!!!'
for mark in string.punctuation:
    if mark == '.':
        # Runs of periods (ellipses) are left untouched.
        continue
    doubled = mark * 2
    # Halve runs of this mark until no doubled occurrence remains.
    while doubled in text:
        text = text.replace(doubled, mark)
print(text)
This would give the following output:
Is it raining? No, but..., it is snoooowing!
Or for a more efficient version giving the same results:
import string
def collapse_repeated_punctuation(text):
    """Return ``text`` with each run of a repeated punctuation mark collapsed.

    Periods are exempt, so ellipses survive unchanged.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    output = []
    last = None  # punctuation mark whose run we are currently inside, if any
    for c in text:
        if c == '.':
            # Periods are always kept, and they end any run in progress so
            # that "?.?" keeps both question marks.
            last = None
            output.append(c)
        elif c != last:
            last = c if c in string.punctuation else None
            output.append(c)
    return ''.join(output)


text = 'Is it raining???? No,,,, but...,,,, it is snoooowing!!!!!!!'
print(collapse_repeated_punctuation(text))
from itertools import groupby
s = 'Is it raining???? okkkk!!! ll... yeh""" ok?'
# groupby yields one (character, run) pair per run of equal characters, so
# each run is handled where it occurs and separate runs of the same character
# cannot be confused with one another.  As in the original criterion, letters
# and periods keep their full run; any other repeated character is reduced to
# a single occurrence.
s = ''.join(
    ''.join(run) if ch.isalpha() or ch == '.' else ch
    for ch, run in groupby(s)
)
print(s)

Categories