Building dictionary of unique IDs for pairs of matching strings - python

I have a dataframe like this
#Test dataframe
import pandas as pd
import numpy as np

#Build df
df = pd.DataFrame({'Title': ['title1', 'cat', 'dog'],
                   'References': [['donkey', 'chicken'], ['title1', 'dog'], ['bird', 'snake']]})

#Insert IDs for UNIQUE titles
df['IDs'] = np.arange(1, len(df) + 1)
df = df[['Title', 'IDs', 'References']]
and I want to generate IDs for the References column so the result looks like the data frame shown below: if a reference string matches an existing title, assign it the same ID as in the IDs column, and if not, assign a new unique ID.
My first attempt is using the function
#Matching function
def string_match(string1, string2):
    if string1 == string2:
        a = 1
    else:
        a = 0
    return a
and then loop over each string/title combination, but this gets tricky with multiple for loops and if statements. Is there a better, more Pythonic way to do this?

# Explode to one reference per row
references = df["References"].explode()
# Combine existing titles with new title from References
titles = pd.concat([df["Title"], references]).unique()
# Assign each title an index number
mappings = {t: i + 1 for i, t in enumerate(titles)}
# Map the reference to the index number and convert to list
df["RefIDs"] = references.map(mappings).groupby(level=0).apply(list)

Let us try with factorize
s = df['References'].explode()
s[:] = pd.concat([df['Title'],s]).factorize()[0][len(df['Title']):]
df['new'] = (s+1).groupby(level=0).agg(list)
Out[237]:
0    [4, 5]
1    [1, 3]
2    [6, 7]
Name: References, dtype: object
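If factorize is unfamiliar: it assigns consecutive integer codes to distinct values in order of first appearance, which is why the answer prepends df['Title'] before factorizing and then slices the title codes back off with [len(df['Title']):]. A minimal illustration:
import pandas as pd

codes, uniques = pd.factorize(['title1', 'cat', 'dog', 'donkey', 'dog'])
print(codes)          # [0 1 2 3 2]  -- repeated values reuse their first code
print(list(uniques))  # ['title1', 'cat', 'dog', 'donkey']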

In addition to the other answers, this can also be done with the help of a function, apply and a lambda:
id_info = dict(df[['Title', 'IDs']].values)

def check(title, ref):
    new_id_ = max(id_info.values())  # get the latest id
    ids = []
    for i in ref:
        if i in id_info:  # if the reference value was defined before, reuse its id
            new_id = id_info[i]
        else:
            new_id = new_id_ + 1  # define a new id and remember it for the next steps
            new_id_ += 1
            id_info.update({i: new_id})
        ids.append(new_id)
    return ids

df['new_id'] = df.apply(lambda x: check(x['Title'], x['References']), axis=1)
print(df)
print(df)
'''
    Title  IDs         References  new_id
0  title1    1  [donkey, chicken]  [4, 5]
1     cat    2      [title1, dog]  [1, 3]
2     dog    3      [bird, snake]  [6, 7]
'''

Related

find the rows with more than 4 values in a list in a column

The dataframe I have, df:
name list
0 kfjh [[a,b,c],[d,f,h],[g,k,l]]
1 jhkg [[a,b,c],[d,f,h],[g,k,l],[f,k,j]]
2 khfg [[a,b,c],[g,k,l]]
3 khkjgr [[a,b,c],[d,f,h]]
4 kjrgjg [[d,f,h]]
5 jkdgr [[a,b,c],[d,f,h],[g,k,l, [g,j,l],[f,l,p]]
6 hgyr [[a,b,c],[d,kf,h],[g,k,l, [g,j,l],[f,l,p]]
7 jkgtjd [[f,l,p]]
8 nkjgrd [t,t,i]
If the list has more than 4 lists, then I would like to get df1.
The desired output, df1 :
name list
5 jkdgr [[a,b,c],[d,f,h],[g,k,l, [g,j,l],[f,l,p]]
6 hgyr [[a,b,c],[d,kf,h],[g,k,l, [g,j,l],[f,l,p]]
and, df2:
name list
0 kfjh [[a,b,c],[d,f,h],[g,k,l]]
1 jhkg [[a,b,c],[d,f,h],[g,k,l],[f,k,j]]
2 khfg [[a,b,c],[g,k,l]]
3 khkjgr [[a,b,c],[d,f,h]]
4 kjrgjg [[d,f,h]]
7 jkgtjd [[f,l,p]]
8 nkjgrd [t,t,i]
You can do something like this if the list column holds strings. If it instead holds actual lists of lists, you can drop the split and just compare the length of each list to 4.
import pandas as pd

data = {
    'name': ['kfjh', 'jhkg', 'khfg', 'khkjgr', 'kjrgjg', 'jkdgr', 'hgyr', 'jkgtjd', 'nkjgrd'],
    'list': ['[[a,b,c],[d,f,h],[g,k,l]]', '[[a,b,c],[d,f,h],[g,k,l],[f,k,j]]', '[[a,b,c],[g,k,l]]',
             '[[a,b,c],[d,f,h]]', '[[d,f,h]]', '[[a,b,c],[d,f,h],[g,k,l],[g,j,l],[f,l,p]]',
             '[[a,b,c],[d,f,h],[g,kf,l],[g,j,l],[f,l,p]]', '[[f,l,p]]', '[t,t,i]']
}
df = pd.DataFrame(data)

# A row with more than 4 sublists contains at least 6 '[' characters, so split('[') yields more than 6 pieces
df['drop'] = df.apply(lambda row: 'no' if len(row['list'].split('[')) > 6 else 'yes', axis=1)
df1 = df.loc[df['drop'] == 'no']   # more than 4 sublists
df2 = df.loc[df['drop'] == 'yes']  # 4 or fewer
df1 = df1.drop(columns=['drop'])
df2 = df2.drop(columns=['drop'])
print(df1)
print(df2)
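As a quick sanity check of the split-based filter on the data above:
# df1 should hold the rows with more than four sublists and df2 the rest
assert list(df1.index) == [5, 6]
assert list(df2.index) == [0, 1, 2, 3, 4, 7, 8]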
Try this to convert the list column from strings into actual Python lists first:
from ast import literal_eval
df.list.apply(literal_eval)
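A minimal sketch of how this combines with a length filter, assuming the column holds valid Python literals (i.e. the inner elements are quoted strings; the unquoted a,b,c style shown in the question would need cleaning first):
from ast import literal_eval
import pandas as pd

# hypothetical frame where 'list' is stored as strings of valid Python literals
df = pd.DataFrame({
    'name': ['jhkg', 'kjrgjg'],
    'list': ["[['a','b','c'],['d','f','h'],['g','k','l'],['f','k','j']]", "[['d','f','h']]"],
})

df['list'] = df['list'].apply(literal_eval)  # now real lists of lists
df1 = df[df['list'].map(len) > 4]            # rows with more than 4 sublists
df2 = df[df['list'].map(len) <= 4]           # the rest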
You can use map(len) to give the number of elements in a List in a column. So you could use:
df1 = df[df['list'].map(len) > 4]
df2 = df[df['list'].map(len) <= 4]
which gives the two sets of results you present
Simply iterate through the original dataframe, get the list length by counting nested lists with a recursive function, and add each row to the corresponding new dataframe:
import pandas as pd

def count_lists(l):
    return sum(1 + count_lists(i) for i in l if isinstance(i, list))

data = {'name': ['kfjh', 'jhkg', 'khfg', 'khkjgr', 'kjrgjg', 'jkdgr', 'hgyr', 'jkgtjd', 'nkjgrd'],
        'list': [[['a','b','c'],['d','f','h'],['g','k','l']], [['a','b','c'],['d','f','h'],['g','k','l'],['f','k','j']],
                 [['a','b','c'],['g','k','l']], [['a','b','c'],['d','f','h']], [['d','f','h']],
                 [['a','b','c'],['d','f','h'],['g','k','l', ['g','j','l'],['f','l','p']]],
                 [['a','b','c'], ['d','kf','h'],['g','k','l', ['g','j','l'], ['f','l','p']]], [['f','l','p']], ['t','t','i']]}
dframe = pd.DataFrame(data)

dframe1 = pd.DataFrame()
dframe2 = pd.DataFrame()
for i, j in dframe.iterrows():
    # j is the whole row; counting the lists in it and subtracting the outer list gives the number of sublists
    if count_lists(j) - 1 > 4:
        dframe1 = pd.concat([dframe1, dframe.iloc[[i]]])  # DataFrame.append was removed in pandas 2.0
    else:
        dframe2 = pd.concat([dframe2, dframe.iloc[[i]]])
print("Dataframe1:\n", dframe1, "\n")
print("Dataframe2:\n", dframe2)
Result:

How can I use a list of data to match data in a dataframe?

I have a list of coordinates, and I need to match them against a dataframe which contains a unique ID and index for each coordinate. I want to match the coordinates and print the ID and index of each coordinate in the list.
e.g.
List_coords = [[1,2],[3,4],[5,6]]
df =
Index ID Coords
1 23 [1,2]
2 34 [3,4]
3 45 [4,5]
4 56 [5,6]
I expect to get something like 1-23, 2-34, 4-56 and save them to another list. How can I do this?
Is this what you are looking for?
match = df['Coords'].isin(List_coords)
(df.loc[match, 'Index'].astype(str) + '-' + df.loc[match, 'ID'].astype(str)).tolist()
The output is
['1-23', '2-34', '4-56']
IIUC, you want to build a list from the Index and ID columns by concatenating them with '-', but only for those rows whose 'Coords' is in List_coords?
Then:
m = df['Coords'].isin(List_coords)
out = df.Index.astype(str).add('-').add(df.ID.astype(str))
out = out[m].tolist()
print(out):
['1-23', '2-34', '4-56']
I think you need,
List_coords = [[1,2],[3,4],[5,6]]
df_matched = df[df['Coords'].isin(List_coords)]
output = df_matched[["Index", "ID"]].astype(str).apply(lambda row: row.str.cat(sep="-"), axis=1).values.tolist()
print(output)
>> ['1-23', '2-34', '4-56']
You could use pandas merge. This solution merges two DataFrames together: one with the IDs + coordinates and another built from the list of coordinates being looked up.
import pandas as pd
# Create the parent DF
parent_df = pd.DataFrame([
    [23, [1, 2]],
    [45, [4, 5]],
    [56, [5, 6]],
    [34, [3, 4]]
], columns=['id', 'coordinates'])
# Set as string to perform the merge
parent_df['coordinates'] = parent_df['coordinates'].astype(str)
# Take a list of input coords, set as a DF
input_coords = [[1, 2], [3, 4], [5, 6], [99, 99]]
list_of_list_of_input_coords = [[coord] for coord in input_coords]
input_coords_df = pd.DataFrame(list_of_list_of_input_coords, columns=['coordinates'])
input_coords_df['coordinates'] = input_coords_df['coordinates'].astype(str)
# Merge the DFs together
merged_df = input_coords_df.merge(parent_df, how='left', on=['coordinates'])
final_list = []
# Create a final list of the ID and coordinates
for index, row in merged_df.iterrows():
    final_list.append([row['id'], row['coordinates']])
This gives a final result as a list:
[[23.0, '[1, 2]'], [34.0, '[3, 4]'], [56.0, '[5, 6]'], [nan, '[99, 99]']]

Pandas write back to dataframe after filter and calculation

I have a dataframe data which contains items and prices. For each item, I want to calculate a normalized price and write it back to the column "Price_Norm". The problem is that only the last iteration has a result and the previous iterations end up empty.
Below is the code, using 2 items with 2 prices each for simplicity:
import pandas as pd

d = {
    'itemName': ['Cat', 'Cat', 'Dog', 'Dog'],
    'Price': [1, 2, 3, 4],
    'Price_Norm': [0, 0, 0, 0],
}
data = pd.DataFrame(data=d)
items = ['Cat', 'Dog']
data_c = data.copy()
for this_item in items:
    this_item_data = data_c[data_c['itemName'] == this_item]
    normalized_price = this_item_data['Price'] - this_item_data['Price'].max()
    data['Price_Norm'] = normalized_price  # <--- FIXME
Each per-item calculation is fine; normalized_price has the correct indices and values. However, after the loop only the last item, Dog, gets a normalized price; all the previous rows are empty, and even the default value 0 is gone.
I think the last line, the Price_Norm assignment, needs to be fixed: it should write to the matching indices ONLY and not overwrite the other rows with empty values. What is the correct way of doing so?
Try:
data['Price_Norm'] = data.groupby('itemName')['Price'].transform(lambda x: x - x.max())
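For the sample data above, this fills Price_Norm with each price minus that item's maximum:
print(data)
#   itemName  Price  Price_Norm
# 0      Cat      1          -1
# 1      Cat      2           0
# 2      Dog      3          -1
# 3      Dog      4           0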

Pandas use cell value as dict key to return dict value

my question relates to using the values in a dataframe column as keys in order to return their respective values and run a conditional.
I have a dataframe, df, containing a column "count" that has integers from 1 to 8 and a column "category" that has values either "A", "B", or "C"
I have a dictionary, dct, containing pairs A:2, B:4, C:6
This is my (incorrect) code:
result = df[df["count"] >= dct.get(df["category"])]
So I want to return a dataframe where the "count" value for a given row is equal to or more than the value retrieved from the dictionary using the "category" letter in the same row.
So if there were count values of (1, 2, 6, 6) and category values of (A, B, C, A), the third and fourth rows would be returned in the resulting dataframe.
How do I modify the above code to achieve this?
A good way to go is to map your dictionary into the existing dataframe and then apply a query on the new column:
import pandas as pd
df = pd.DataFrame(data={'count': [4, 5, 6], 'category': ['A', 'B', 'C']})
dct = {'A':5, 'B':4, 'C':-1}
df['min_count'] = df['category'].map(dct)
df = df.query('count >= min_count')
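With the sample values used in this answer, that keeps the rows whose count meets the per-category threshold:
print(df)
#    count category  min_count
# 1      5        B          4
# 2      6        C         -1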
following your logic:
import pandas as pd
dct = {'A':2, 'B':4, 'C':6}
df = pd.DataFrame({'count':[1,2,5,6],
'category':['A','B','C','A']})
print('original dataframe')
print(df)
def process_row(x):
    # keep the row when its count reaches the threshold for its category
    return x['count'] >= dct[x['category']]
f = df.apply(lambda row: process_row(row), axis=1)
df = df[f]
print('final output')
print(df)
output:
original dataframe
   count category
0      1        A
1      2        B
2      5        C
3      6        A
final output
   count category
3      6        A
A small modification to your code:
result = df[df['count'] >= df['category'].apply(lambda x: dct[x])]
You cannot directly use dct.get(df['category']) because df['category'] returns a Series, which is unhashable and so cannot be used as a dictionary key (dictionary keys must be hashable).
So, apply and lambda to the rescue! :)
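Using the example values from the question (counts 1, 2, 6, 6, categories A, B, C, A and dct = {'A': 2, 'B': 4, 'C': 6}), this keeps the third and fourth rows:
import pandas as pd

dct = {'A': 2, 'B': 4, 'C': 6}
df = pd.DataFrame({'count': [1, 2, 6, 6], 'category': ['A', 'B', 'C', 'A']})
result = df[df['count'] >= df['category'].apply(lambda x: dct[x])]
print(result)
#    count category
# 2      6        C
# 3      6        A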

python dictionary - list of lists to dict

I'm trying to take a list of lists and convert it into a dictionary. See the code below.
yearend = [['empl','rating1','rating2','rating3'],['mike','4','4','5'],
['sam','3','2','5'],['doug','5','5','5']]
extract the employee names
employee = [item[0] for item in yearend] #select 1st item from each list
employee.pop(0) # pop out the empl
print(employee)
### output##################################################
##['mike', 'sam', 'doug']###################################
###Output###################################################
###extract the various rating types
yearend1 = yearend [:] # make a copy
rating = yearend1.pop(0) # Pop out the 1st list
rating.pop(0)
print(rating)
### output##################################################
##['rating1', 'rating2', 'rating3']#########################
###Output###################################################
# pick employee and rating and convert rating to numeric
empl_rating = {t[0]:t[1:] for t in yearend1}
for key,value in empl_rating.items():
value = list(map(int, value))
empl_rating[key] = value
print(empl_rating)
### output##################################################
##{'mike': [4, 4, 5], 'sam': [3, 2, 5], 'doug': [5, 5, 5]}##
###Output###################################################
I extracted the data as above and now I am trying to put it together into a dict (New_dicts) so that when
New_dicts['sam']['rating1']
I get 3 or
New_dicts['doug']['rating3']
I get 5. What I'm struggling with is how to put this data together.
def todict(ratings):
    a = {}
    a["rating1"] = ratings[0]
    a["rating2"] = ratings[1]
    a["rating3"] = ratings[2]
    return a
One way to solve your problem is to get rid of the first row with the headings and then just do:
{item[0]: todict(item[1:]) for item in your_list}
BTW this solution is based off of how you wanted to index it; I'm sure there is a more generic solution out there.
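For instance, reusing yearend1 from the question (which already has the header row removed) and converting the rating strings to ints as the question does:
New_dicts = {item[0]: todict([int(r) for r in item[1:]]) for item in yearend1}

print(New_dicts['sam']['rating1'])   # 3
print(New_dicts['doug']['rating3'])  # 5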
Because what you want is essentially just a nested dict, you can use a dict comprehension:
New_dicts = {line[0]: {yearend[0][i + 1]: int(rating) for i, rating in enumerate(line[1:])} for line in yearend[1:]}
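Run against the original yearend list this gives the nested dict you asked for (use a fresh copy of yearend; the pops earlier in the question modify the shared header list in place):
print(New_dicts)
# {'mike': {'rating1': 4, 'rating2': 4, 'rating3': 5},
#  'sam': {'rating1': 3, 'rating2': 2, 'rating3': 5},
#  'doug': {'rating1': 5, 'rating2': 5, 'rating3': 5}}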
