pandas: match substring from a column in dataframe with another dataframe column - python

I have two dataframes like the following, but with more rows:
data = {'First': [['First', 'value'],['second','value'],['third','value','is'],['fourth','value','is']],
'Second': [['adj','noun'],['adj','noun'],['adj','noun','verb'],['adj','noun','verb']]}
df = pd.DataFrame (data, columns = ['First','Second'])
data2 = {'example': ['First value is important', 'second value is imprtant too','it us goof to know']}
df2 = pd.DataFrame (data2, columns = ['example'])
I wrote a function that checks if the first word in the example column can be found in the First column in the first dataframe, and if true return the string, like the following:
def reader():
for l in [l for l in df2.example]:
if df["first"].str.contains(pat=l.split(' ', 1)[0]).any() is True:
return l
However, I realized that it would not work because the First column in df holds lists of strings, so I made the following modification:
def reader():
for l in [l for l in df2.example]:
df['first_unlist'] = [','.join(map(str, l)) for l in df.First]
if df["first_unlist"].str.contains(pat=l.split(' ', 1)[0]).any() is True:
return l
However, I still get 'None' when I run the function, and I cannot figure out what is wrong here.
Update:
I would like the function to return the first two strings in the example column, 'First value is important', 'second value is imprtant too'

Your function doesn't return False when the first word in the example column cannot be found. Here is the revision.
def reader(examples=None, word_lists=None):
    """Return every example string whose first word occurs in the word lists.

    Args:
        examples: Iterable of strings to filter. Defaults to the
            module-level ``df2.example`` column.
        word_lists: Iterable of lists of words to look the first word up in.
            Defaults to the module-level ``df.First`` column.

    Returns:
        A list with the matching strings, in their original order (empty
        when nothing matches).
    """
    if examples is None:
        examples = df2.example
    if word_lists is None:
        word_lists = df.First

    # Flatten the lists into one set so each lookup is an exact-word,
    # constant-time membership test rather than a regex substring search.
    known_words = {word for words in word_lists for word in words}

    # NOTE: the earlier versions tested ``Series.any() is True``. ``any()``
    # has historically returned a NumPy boolean, which is never *identical*
    # to the built-in ``True``, so that branch could not run. Plain
    # truthiness / membership avoids the problem altogether.
    return [text for text in examples if text.split(' ', 1)[0] in known_words]
reader()

Related

How to check if any of elements in a dictionary value is in string?

I have a dataframe with strings and a dictionary which values are lists of strings.
I need to check if each string of the dataframe contains any element of every value in the dictionary. And if it does, I need to label it with the appropriate key from the dictionary. All I need to do is to categorize all the strings in the dataframe with keys from the dictionary.
For example.
df = pd.DataFrame({'a':['x1','x2','x3','x4']})
d = {'one':['1','aa'],'two':['2','bb']}
I would like to get something like this:
df = pd.DataFrame({
'a':['x1','x2','x3','x4'],
'Category':['one','two','x3','x4']})
I tried this, but it has not worked:
df['Category'] = np.nan
for k, v in d.items():
for l in v:
df['Category'] = [k if l in str(x).lower() else x for x in df['a']]
Any ideas appreciated!
First, create a function that does this for you:-
def func(val, mapping=None):
    """Return the key of the first category whose list contains ``val``.

    Args:
        val: The value to look up (compared for exact membership).
        mapping: Dict of ``category -> list of members``. Defaults to the
            module-level dictionary ``d``.

    Returns:
        The matching key, or ``None`` when no category lists ``val`` (so the
        caller can fill the gaps with ``fillna``).
    """
    if mapping is None:
        mapping = d
    # Walk key/value pairs directly instead of rebuilding
    # list(d.values()) / list(d.keys()) on every iteration.
    for key, members in mapping.items():
        if val in members:
            return key
    return None
Now make use of split() and apply() method:-
df['Category']=df['a'].str.split('',expand=True)[2].apply(func)
Finally use fillna() method:-
df['Category']=df['Category'].fillna(df['a'])
Now if you print df you will get your expected output:-
a Category
0 x1 one
1 x2 two
2 x3 x3
3 x4 x4
Edit:
You can also do this by:-
def func(val, mapping=None):
    """Return the key of the first category with an element found inside ``val``.

    Args:
        val: The string to classify.
        mapping: Dict of ``category -> list of substrings``. Defaults to the
            module-level dictionary ``d``.

    Returns:
        The first key (in dict order) one of whose substrings occurs in
        ``val``, or ``None`` when none does (so the caller can fill the gaps
        with ``fillna``).
    """
    if mapping is None:
        mapping = d
    # Iterate the pairs directly instead of indexing freshly built
    # list(d.values()) / list(d.keys()) copies on every pass.
    for key, needles in mapping.items():
        if any(needle in val for needle in needles):
            return key
    return None
then:-
df['Category']=df['a'].apply(func)
Finally:-
df['Category']=df['Category'].fillna(df['a'])
I've come up with the following heuristic, which looks really dirty.
It outputs what you desire, albeit with some warnings, since I've used indices to append values to dataframe.
import pandas as pd
import numpy as np
def main():
    """Label each string in column ``a`` with a category key and print the frame.

    A string gets the first key (in dict order) one of whose substrings it
    contains; a string matching no category is labelled with itself.
    """
    df = pd.DataFrame({'a': ['x1', 'x2', 'x3', 'x4']})
    d = {'one': ['1', 'aa'], 'two': ['2', 'bb']}

    def categorize(text):
        # First matching key wins; unmatched text falls back to itself.
        for key, needles in d.items():
            if any(needle in text for needle in needles):
                return key
        return text

    # Assign the whole column at once. Writing cells through chained
    # indexing (df['Category'][i] = ...) triggers SettingWithCopyWarning and
    # does not update the frame at all under pandas copy-on-write.
    df['Category'] = [categorize(text) for text in df['a']]
    print(df)
main()

Can't append Column Names of Dataframe To A List Using For Loop

I have a Dataframe like this:-
data = [['a', 'b', 'c', 'd'],['q', 'r', 's', 't'],['n'],['w', 'x', 'y', 'z']]
df = pd.DataFrame(data, columns = ['Full_1', 'Full_2', 'Full_3', 'Full_4'])
Now I want to append to a list the names of the dataframe's columns that contain a 'None' value, using a for loop inside a function:
lst=[]
def lister(df):
for c in df.columns:
if (df[c].isna().max())==True:
lst.append(c)
return lst
else:
nope = 'None'
return nope
It returns 'None' instead of lst.
Now If I print c Inside of for loop i.e
lst=[]
def lister(df):
for c in df.columns:
if (df[c].isna().max())==True:
print(c)
#return lst
else:
nope = 'None'
#return nope
Output of c inside for loop:-
Full_2
Full_3
Full_4
So why are these values not being appended to the list named lst?
Expected output of lst:-
['Full_2','Full_3','Full_4']
>>> df.columns[df.isna().any()].to_list()
['Full_2', 'Full_3', 'Full_4']
Edit: update your function like this.
def lister(df):
    """Return the labels of the columns of ``df`` that contain a missing value.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A list of column labels, in the frame's column order, for which at
        least one cell is NA (``None``/``NaN``). Empty when no column has one.
    """
    # df.isna().any() is a boolean Series with one entry per column; using it
    # as a mask on df.columns replaces the per-column loop and the
    # `... == True` comparison.
    return df.columns[df.isna().any()].to_list()
>>> lister(df)
['Full_2', 'Full_3', 'Full_4']
You're initialising the list every time.
So it resets to an empty list inside the if statement.
Move your lst=[] line outside of the for loop.

Find Anagram in Pandas DataFrame columns

Given the Dataframe
df = pd.DataFrame({'word1': ['elvis', 'lease', 'admirer'], 'word2': ['lives', 'sale', 'married']})
how can I add a third column that returns True or False depending on whether the two words in the same row are an anagram or not?
I have written this function, which returns an error when I apply it to the df.
def anagram(word1, word2):
    """Tell whether ``word1`` and ``word2`` are made of exactly the same items.

    Both arguments are sorted element by element and the sorted sequences
    are compared, so two words match when they hold the same letters the
    same number of times.
    """
    ordered_first = sorted(word1)
    ordered_second = sorted(word2)
    return ordered_first == ordered_second
df['Anagram'] = df.apply(anagram(df['word1'], df['word2']), axis = 1)
TypeError: 'bool' object is not callable
df = pd.DataFrame({'word1': ['elvis', 'lease', 'admirer'], 'word2': ['lives', 'sale', 'married']})
df['Anagram'] = df.word1.apply(sorted) == df.word2.apply(sorted)
The issue here is that you are calling df.apply() with the args
anagram(df['word1'], df['word2']) which is a bool, not a function
and
axis = 1
To fix, alter your function like this:
def anagram(row):
    """Tell whether the ``word1`` and ``word2`` entries of ``row`` are anagrams.

    ``row`` is anything indexable by the keys ``'word1'`` and ``'word2'``
    (for example a DataFrame row passed by ``df.apply(..., axis=1)``).
    """
    first, second = row['word1'], row['word2']
    return sorted(first) == sorted(second)
then call the method with the function name, not the result
df['Anagram'] = df.apply(anagram, axis=1)

Pandas return index name and column name function

I am looking for a pandas apply function that can return the tuple (index_name, column_name, value). The value is the entry at the specified row and column.
Something like the following function,
def pair(val):
return zip(index.name,column.name,val)
If you want a list with tuples for every value of your dataframe df, you can try something like this:
# One (index label, column name, cell value) tuple per cell, row by row.
my_list = [
    (index, col, row[col])
    for index, row in df.iterrows()
    for col in df.columns
]
If you want a list with all tuples matching a single value, this function should work:
def findTuples(value, df):
    """Return a tuple for every cell of ``df`` that equals ``value``.

    Args:
        value: The value to search for (compared with ``==``).
        df: The DataFrame to search.

    Returns:
        A list of ``(index_label, column_name, cell)`` tuples, grouped by
        column in column order and, within a column, in row order. Empty
        when the value does not occur.
    """
    my_list = []
    for col in df.columns:
        column = df[col]
        # Filter the column once. Reading the cells from the column itself
        # (rather than from row-wise apply) also keeps each cell's own dtype.
        matches = column[column == value]
        # extend() grows the list in place; `my_list = my_list + ...` copied
        # the whole list for every column.
        my_list.extend((index, col, cell) for index, cell in matches.items())
    return my_list

Find co-occurrences of two lists in python

I have two lists. The first one contains names, and the second one contains names and corresponding values. The names in the first list are a subset of the names in the second list. The values are either true or false. I want to find the co-occurrences of the names in both lists and count the true values. My code:
def return_indices_of_a(a, b):
    """Return the positions in ``a`` whose element also occurs in ``b``."""
    b_set = set(b)  # set gives constant-time membership tests
    return [i for i, v in enumerate(a) if v in b_set]  # the co-occurrences


# `with` closes each file as soon as it has been read; the bare open()
# calls left both handles open.
with open("text_files/first_list.txt", 'r') as first_file:
    data1 = [line.strip() for line in first_file]

with open("text_files/second_list.txt", "r") as ins:  # "r" is the default mode
    # rstrip() drops the trailing newline before splitting into fields.
    parseTable = [line.rstrip().split(' ') for line in ins]

# First field of every row is the name, second field is its value.
new_data = [row[0] for row in parseTable]
indexes = [row[1] for row in parseTable]

# The helper is defined above, so it exists by the time it is called here.
in1 = return_indices_of_a(new_data, data1)
I am reading both text files, which contain the lists. I found the co-occurrences, and then I want to keep from parseTable[][1] only the in1 indices. Am I doing it right? How can I keep the indices I want? My two lists:
['SITNC', 'porkpackerpete', 'teensHijab', '1DAlert', 'IsmodoFashion',....
[['SITNC', 'true'], ['1DFAMlLY', 'false'], ['tibi', 'true'], ['1Dneews', 'false'], ....
Here's a one liner to get the matches:
matches = [(name, dict(values)[name]) for name in set(names) if name in dict(values)]
and then to get the number of true matches:
len([name for (name, value) in matches if value == 'true'])
Edit
You might want to move dict(values) into a named variable:
value_map = dict(values)
matches = [(name, value_map[name]) for name in set(names) if name in value_map]
There are two ways, one is what Andrey suggests (you may want to convert names to set), or, alternatively, convert the second list into a dictionary:
# Turn the [name, value] pairs into a dict so each name is one lookup.
mapping = dict(values)
# The stored values are the strings 'true'/'false', so compare them to get
# real booleans before summing; .get() tolerates names that are missing from
# the second list instead of raising KeyError.
sum_of_true = sum(mapping.get(n) == 'true' for n in names)
The latter sum works because bool is essentially int in Python (True == 1).
If you need just the sum of true values, then use in operator and list comprehension:
In [1]: names = ['SITNC', 'porkpackerpete', 'teensHijab', '1DAlert', 'IsmodoFashion']
In [2]: values = [['SITNC', 'true'], ['1DFAMlLY', 'false'], ['tibi', 'true'], ['1Dneews', 'false']]
In [3]: sum_of_true = len([v for v in values if v[0] in names and v[1] == "true"])
In [4]: sum_of_true
Out[4]: 1
To get also indices of co-occurrences, this one-liner may come in handy:
In [6]: true_indices = [names.index(v[0]) for v in values if v[0] in names and v[1] == "true"]
In [7]: true_indices
Out[7]: [0]

Categories