split string row wise with condition in python - python

I have some strings in a column and I want to explode the words out only if they are not within brackets. The column looks like this
pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})
and I want the output to look like this
pd.DataFrame(data={'a': ['first','string','(second,string)','third','string','(another,string,here)']})
This sort of works, but i would like to not have to put the row number in each time
re.split(r',(?![^()]*\))', x['a'][0])
re.split(r',(?![^()]*\))', x['a'][1])
re.split(r',(?![^()]*\))', x['a'][2])
i thought i could do with a lmbda function but i cannot get it to work. Thanks for checking this out
x['a'].apply(lambda i: re.split(r',(?![^()]*\))', i))

It is not clear to me if the elements in your DataFrame may have multiple groups between brackets. Given that doubt, I have implemented the following:
import pandas as pd
import re
df = pd.DataFrame(data={'a': ['first,string','(second,string)','third,string (another,string,here)']})
pattern = re.compile("([^\(]*)([\(]?.*[\)]?)(.*)", re.IGNORECASE)
def findall(ar, res = None):
if res is None:
res = []
m = pattern.findall(ar)[0]
if len(m[0]) > 0:
res.extend(m[0].split(","))
if len(m[1]) > 0:
res.append(m[1])
if len(m[2]) > 0:
return findall(ar[2], res = res)
else:
return res
res = []
for x in df["a"]:
res.extend(findall(x))
print(pd.DataFrame(data={"a":res}))
Essentially, you recursively scan the last part of the match until you find no more words between strings. If order was not an issue, the solution is easier.

Related

recursively merging rows pandas dataframe based on the condition

community,
I have a sorted pandas dataframe that looks as following:
I want to merge rows that have overlapping values in start and end columns. Meaning that if the end value of initial row is bigger than start value of the sequential one or any othe sequential, they will be merged into one row. Examples are rows 3, 4 and 5. Output I would expect is:
To do so, I am trying to implement recursion function, that would loop over the dataframe until condition worsk and then return me a number that would be used to search location for the end row .
However, the functioin I am trying to implement, returns me empty dataframe. Could you help me please, where should I put attention, or what alternative can I build if recurtion is not a solution?
def row_merger(pd_df):
counter = 0
new_df = pd.DataFrame(columns=pd_df.columns)
for i in range(len(pd_df) - 1):
def recursion_inside(pd_df, counter = 0):
counter = 0
if pd_df.iloc[i + 1 + counter]["q.start"] <= pd_df.iloc[i]["q.end"]:
counter = counter+1
recursion_inside(pd_df, counter)
else:
return counter
new_row = {"name": pd_df["name"][i], "q.start": pd_df.iloc[i]
["q.start"], "q.end": pd_df.iloc[i+counter]["q.start"]}
new_df.append(new_row, ignore_index=True)
return new_df
I don't see the benefit of using recursion here, so I would just iterate over the rows instead, building up the rows for the output dataframe one by one, e.g. like this:
def row_merger(df_in):
if len(df_in) <= 1:
return df_in
rows_out = []
current_row = df_in.iloc[0].values
for next_row in df_in.iloc[1:].values:
if next_row[1] > current_row[2]:
rows_out.append(current_row)
current_row = next_row
else:
current_row[2] = max(current_row[2], next_row[2])
rows_out.append(current_row)
return pd.DataFrame(rows_out, columns=df_in.columns)

Iterate over list

Q6
4;99
3;4;8;9;14;18
2;3;8;12;18
2;3;11;18
2;3;8;18
2;3;4;5;6;7;8;9;11;12;15;16;17;18
2;3;4;8;9;10;11;13;18
1;3;4;5;6;7;13;16;17
2;3;4;5;6;7;8;9;11;12;14;15;18
3;11;18
2;3;5;8;9;11;12;13;15;16;17;18
2;5;11;18
1;2;3;4;5;8;9;11;17;18
3;7;8;11;13;14
2;3;8;18
2;13
2;3;5;8;9;11;12;13;18
2;3;4;9;11;12;18
2;3;5;9;11;18
1;2;3;4;5;6;7;8;9;11;14;15;16;17;18
2;3;8;11;13;18
import pandas as pd
df_1 = pd.read_csv('amazon_final 29082018.csv')
list_6 = list(df_1["Q6"])
list_6 = list(map(str, list_6))
list_7 = list(zip(list_6))
tem_list = []
for x in list_6:
if ('3' in x[0]):
tem_list.append('Fire')
else:
tem_list.append(None)
df_1.to_csv('final.csv', index=False)
I have many such columns in data.
I want to extract value '3' from this, the code which i wrote is give giving me 3 value along with 13,23,33 so on. I only want count of rows having value 3.
You need to break up the rows and convert each value to an integer. At the moment you are looking for the presence of the string "3" which is why strings like "2;13" pass the test. Try something like this:
list_6 = ["4;99", "3;4;8;9;14;18", "2;3;8;12;18", "2;3;11;18", "2;3;8;18",
"2;3;4;5;6;7;8;9;11;12;15;16;17;18", "2;3;4;8;9;10;11;13;18",
"1;3;4;5;6;7;13;16;17", "2;3;4;5;6;7;8;9;11;12;14;15;18", "3;11;18",
"2;3;5;8;9;11;12;13;15;16;17;18", "2;5;11;18", "1;2;3;4;5;8;9;11;17;18",
"3;7;8;11;13;14", "2;3;8;18", "2;13", "2;3;5;8;9;11;12;13;18",
"2;3;4;9;11;12;18", "2;3;5;9;11;18",
"1;2;3;4;5;6;7;8;9;11;14;15;16;17;18", "2;3;8;11;13;18"]
temp_list = []
for x in list_6:
numbers = [int(num_string) for num_string in x.split(';')]
if (3 in numbers):
temp_list.append('Fire')
else:
temp_list.append('None')
print(temp_list)

Populating python matrix

I'm doing the splitting of the words from the text file in python. I've receive the number of row (c) and a dictionary (word_positions) with index. Then I create a zero matrix (c, index). Here is the code:
from collections import defaultdict
import re
import numpy as np
c=0
f = open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r')
for line in f:
c = c + 1
word_positions = {}
with open('/Users/Half_Pint_Boy/Desktop/sentenses.txt', 'r') as f:
index = 0
for word in re.findall(r'[a-z]+', f.read().lower()):
if word not in word_positions:
word_positions[word] = index
index += 1
print(word_positions)
matrix=np.zeros(c,index)
My question: How can I populate the matrix to be able to get this: matrix[c,index] = count, where c - is the number of row, index -the indexed position and count -the number of counted words in a row
Try next:
import re
import numpy as np
from itertools import chain
text = open('/Users/Half_Pint_Boy/Desktop/sentenses.txt')
text_list = text.readlines()
c=0
for i in range(len(text_list)):
c=c+1
text_niz = []
for i in range(len(text_list)):
text_niz.append(text_list[i].lower()) # перевел к нижнему регистру
slovo = []
for j in range(len(text_niz)):
slovo.append(re.split('[^a-z]', text_niz[j])) # токенизация
for e in range(len(slovo)):
while slovo[e].count('') != 0:
slovo[e].remove('') # удалил пустые слова
slovo_list = list(chain(*slovo))
print (slovo_list) # составил список слов
slovo_list=list(set(slovo_list)) # удалил повторяющиеся
x=len(slovo_list)
s = []
for i in range(len(slovo)):
for j in range(len(slovo_list)):
s.append(slovo[i].count(slovo_list[j])) # посчитал количество слов в каждом предложении
matr = np.array(s) # матрица вхождений слов в предложения
d = matr.reshape((c, x)) # преобразовал в матрицу 22*254
It looks like you are trying to create something similar to an n-dimensional list. these are achieved by nesting lists inside themselves as such:
two_d_list = [[0, 1], [1, 2], [example, blah, blah blah]]
words = two_d_list[2]
single_word = two_d_list[2][1] # Notice the second index operator
This concept is very flexible in Python and can also be done with a dictionary nested inside as you would like:
two_d_list = [{"word":1}, {"example":1, "blah":3}]
words = two_d_list[1] # type(words) == dict
single_word = two_d_list[2]["example"] # Similar index operator, but for the dictionary
This achieves what you would like, functionally, but does not use the syntax matrix[c,index], however this syntax does not really exist in python for indexing. Commas within square-brackets usually delineate the elements of list literals. Instead you can access the row's dictionary's element with matrix[c][index] = count
You may be able to overload the index operator to achieve the syntx you want. Here is a question about achieving the syntax you desire. In summary:
Overload the __getitem__(self, inex) function in a wrapper of the list class and set the function to accept a tuple. The tuple can be created without parenthesis, giving the syntax matrix[c, index] = count

Efficiently update columns based on one of the columns split value

So here is my code updating many column values based on a condition of split values of the column 'location'. The code works fine, but as its iterating by row it's not efficient enough. Can anyone help me to make this code work faster please?
for index, row in df.iterrows():
print index
location_split =row['location'].split(':')
after_county=False
after_province=False
for l in location_split:
if l.strip().endswith('ED'):
df[index, 'electoral_district'] = l
elif l.strip().startswith('County'):
df[index, 'county'] = l
after_county = True
elif after_province ==True:
if l.strip()!='Ireland':
df[index, 'dublin_postal_district'] = l
elif after_county==True:
df[index, 'province'] = l.strip()
after_province = True
'map' was what I needed :)
def fill_county(column):
res = ''
location_split = column.split(':')
for l in location_split:
if l.strip().startswith('County'):
res= l.strip()
break
return res
df['county'] = map(fill_county, df['location'])

Python: How to replace N random string occurrences in text?

Say that I have 10 different tokens, "(TOKEN)" in a string. How do I replace 2 of those tokens, chosen at random, with some other string, leaving the other tokens intact?
>>> import random
>>> text = '(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)'
>>> token = '(TOKEN)'
>>> replace = 'foo'
>>> num_replacements = 2
>>> num_tokens = text.count(token) #10 in this case
>>> points = [0] + sorted(random.sample(range(1,num_tokens+1),num_replacements)) + [num_tokens+1]
>>> replace.join(token.join(text.split(token)[i:j]) for i,j in zip(points,points[1:]))
'(TOKEN)__(TOKEN)__(TOKEN)__(TOKEN)__foo__(TOKEN)__foo__(TOKEN)__(TOKEN)__(TOKEN)'
In function form:
>>> def random_replace(text, token, replace, num_replacements):
num_tokens = text.count(token)
points = [0] + sorted(random.sample(range(1,num_tokens+1),num_replacements)) + [num_tokens+1]
return replace.join(token.join(text.split(token)[i:j]) for i,j in zip(points,points[1:]))
>>> random_replace('....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....','(TOKEN)','FOO',2)
'....FOO....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....(TOKEN)....FOO....'
Test:
>>> for i in range(0,9):
print random_replace('....(0)....(0)....(0)....(0)....(0)....(0)....(0)....(0)....','(0)','(%d)'%i,i)
....(0)....(0)....(0)....(0)....(0)....(0)....(0)....(0)....
....(0)....(0)....(0)....(0)....(1)....(0)....(0)....(0)....
....(0)....(0)....(0)....(0)....(0)....(2)....(2)....(0)....
....(3)....(0)....(0)....(3)....(0)....(3)....(0)....(0)....
....(4)....(4)....(0)....(0)....(4)....(4)....(0)....(0)....
....(0)....(5)....(5)....(5)....(5)....(0)....(0)....(5)....
....(6)....(6)....(6)....(0)....(6)....(0)....(6)....(6)....
....(7)....(7)....(7)....(7)....(7)....(7)....(0)....(7)....
....(8)....(8)....(8)....(8)....(8)....(8)....(8)....(8)....
If you need exactly two, then:
Detect the tokens (keep some links to them, like index into the string)
Choose two at random (random.choice)
Replace them
What are you trying to do, exactly? A good answer will depend on that...
That said, a brute-force solution that comes to mind is to:
Store the 10 tokens in an array, such that tokens[0] is the first token, tokens[1] is the second, ... and so on
Create a dictionary to associate each unique "(TOKEN)" with two numbers: start_idx, end_idx
Write a little parser that walks through your string and looks for each of the 10 tokens. Whenever one is found, record the start/end indexes (as start_idx, end_idx) in the string where that token occurs.
Once done parsing, generate a random number in the range [0,9]. Lets call this R
Now, your random "(TOKEN)" is tokens[R];
Use the dictionary in step (3) to find the start_idx, end_idx values in the string; replace the text there with "some other string"
My solution in code:
import random
s = "(TOKEN)test(TOKEN)fgsfds(TOKEN)qwerty(TOKEN)42(TOKEN)(TOKEN)ttt"
replace_from = "(TOKEN)"
replace_to = "[REPLACED]"
amount_to_replace = 2
def random_replace(s, replace_from, replace_to, amount_to_replace):
parts = s.split(replace_from)
indices = random.sample(xrange(len(parts) - 1), amount_to_replace)
replaced_s_parts = list()
for i in xrange(len(parts)):
replaced_s_parts.append(parts[i])
if i < len(parts) - 1:
if i in indices:
replaced_s_parts.append(replace_to)
else:
replaced_s_parts.append(replace_from)
return "".join(replaced_s_parts)
#TEST
for i in xrange(5):
print random_replace(s, replace_from, replace_to, 2)
Explanation:
Splits string into several parts using replace_from
Chooses indexes of tokens to replace using random.sample. This returned list contains unique numbers
Build a list for string reconstruction, replacing tokens with generated index by replace_to.
Concatenate all list elements into single string
Try this solution:
import random
def replace_random(tokens, eqv, n):
random_tokens = eqv.keys()
random.shuffle(random_tokens)
for i in xrange(n):
t = random_tokens[i]
tokens = tokens.replace(t, eqv[t])
return tokens
Assuming that a string with tokens exists, and a suitable equivalence table can be constructed with a replacement for each token:
tokens = '(TOKEN1) (TOKEN2) (TOKEN3) (TOKEN4) (TOKEN5) (TOKEN6) (TOKEN7) (TOKEN8) (TOKEN9) (TOKEN10)'
equivalences = {
'(TOKEN1)' : 'REPLACEMENT1',
'(TOKEN2)' : 'REPLACEMENT2',
'(TOKEN3)' : 'REPLACEMENT3',
'(TOKEN4)' : 'REPLACEMENT4',
'(TOKEN5)' : 'REPLACEMENT5',
'(TOKEN6)' : 'REPLACEMENT6',
'(TOKEN7)' : 'REPLACEMENT7',
'(TOKEN8)' : 'REPLACEMENT8',
'(TOKEN9)' : 'REPLACEMENT9',
'(TOKEN10)' : 'REPLACEMENT10'
}
You can call it like this:
replace_random(tokens, equivalences, 2)
> '(TOKEN1) REPLACEMENT2 (TOKEN3) (TOKEN4) (TOKEN5) (TOKEN6) (TOKEN7) (TOKEN8) REPLACEMENT9 (TOKEN10)'
There are lots of ways to do this. My approach would be to write a function that takes the original string, the token string, and a function that returns the replacement text for an occurrence of the token in the original:
def strByReplacingTokensUsingFunction(original, token, function):
outputComponents = []
matchNumber = 0
unexaminedOffset = 0
while True:
matchOffset = original.find(token, unexaminedOffset)
if matchOffset < 0:
matchOffset = len(original)
outputComponents.append(original[unexaminedOffset:matchOffset])
if matchOffset == len(original):
break
unexaminedOffset = matchOffset + len(token)
replacement = function(original=original, offset=matchOffset, matchNumber=matchNumber, token=token)
outputComponents.append(replacement)
matchNumber += 1
return ''.join(outputComponents)
(You could certainly change this to use shorter identifiers. My style is somewhat more verbose than typical Python style.)
Given that function, it's easy to replace two random occurrences out of ten. Here's some sample input:
sampleInput = 'a(TOKEN)b(TOKEN)c(TOKEN)d(TOKEN)e(TOKEN)f(TOKEN)g(TOKEN)h(TOKEN)i(TOKEN)j(TOKEN)k'
The random module has a handy method for picking random items from a population (not picking the same item twice):
import random
replacementIndexes = random.sample(range(10), 2)
Then we can use the function above to replace the randomly-chosen occurrences:
sampleOutput = strByReplacingTokensUsingFunction(sampleInput, '(TOKEN)',
(lambda matchNumber, token, **keywords:
'REPLACEMENT' if (matchNumber in replacementIndexes) else token))
print sampleOutput
And here's some test output:
a(TOKEN)b(TOKEN)cREPLACEMENTd(TOKEN)e(TOKEN)fREPLACEMENTg(TOKEN)h(TOKEN)i(TOKEN)j(TOKEN)k
Here's another run:
a(TOKEN)bREPLACEMENTc(TOKEN)d(TOKEN)e(TOKEN)f(TOKEN)gREPLACEMENTh(TOKEN)i(TOKEN)j(TOKEN)k
from random import sample
mystr = 'adad(TOKEN)hgfh(TOKEN)hjgjh(TOKEN)kjhk(TOKEN)jkhjk(TOKEN)utuy(TOKEN)tyuu(TOKEN)tyuy(TOKEN)tyuy(TOKEN)tyuy(TOKEN)'
def replace(mystr, substr, n_repl, replacement='XXXXXXX', tokens=10, index=0):
choices = sorted(sample(xrange(tokens),n_repl))
for i in xrange(choices[-1]+1):
index = mystr.index(substr, index) + 1
if i in choices:
mystr = mystr[:index-1] + mystr[index-1:].replace(substr,replacement,1)
return mystr
print replace(mystr,'(TOKEN)',2)

Categories