Aggregate list dataframe in pandas using dedicated function - python

I have the following dataframe in pandas:
import pandas as pd

data = {'ID_1': {0: '10A00', 1: '10B00', 2: '20001', 3: '20001'},
        'ID_2_LIST': {0: [20009, 30006], 1: [20001, 30006],
                      2: [30009, 30006], 3: [20001, 30003]},
        'ID_OCCURRENCY_LIST': {0: [1, 2], 1: [5, 6], 2: [2, 4], 3: [1, 3]}}
# create df
df = pd.DataFrame(data)
| | ID_1 | ID_2_LIST | ID_OCCURRENCY_LIST |
|---:|:-------|:---------------|:---------------------|
| 0 | 10A00 | [20009, 30006] | [1, 2] |
| 1 | 10B00 | [20001, 30006] | [5, 6] |
| 2 | 20001 | [30009, 30006] | [2, 4] |
| 3 | 20001 | [20001, 30003] | [1, 3] |
I would like to aggregate by the ID_1 field, applying an external function (to identify similar ID_1 values, say similarID(ID1, ID2), which returns either ID1 or ID2 according to some internal rules), regenerate the list of ID_2 values, and sum the occurrences of all equal ID_2 values.
The outcome should be:
|    | ID_1  | ID_2_LIST                    | ID_OCCURRENCY_LIST |
|---:|:------|:-----------------------------|:-------------------|
|  0 | 10A00 | [20009, 30006, 20001]        | [1, 8, 5]          |
|  1 | 10B00 | [20001, 30006, 30003, 20001] | [5, 6, 4, 2]       |
|  2 | 20001 | [30009, 30006, 20001, 30003] | [2, 4, 1, 3]       |
EDIT
The code for the function is the following (s1 = first string, c1 = second string, p1 = similarity percentage, l1 = confidence level; pyDamerauLevenschtein is a standard Damerau-Levenshtein distance function from the literature):
def pySimilar(s1, c1, p1, l1):
    if s1 is None or c1 is None:
        return 0
    if len(s1) <= 5 or len(c1) <= 5:
        return 0
    s1 = s1.strip()
    c1 = c1.strip()
    s = s1
    c = c1
    if s1[3:len(s1)] == c1[3:len(c1)]:
        return 1
    if len(s1) >= len(c1):
        ITERATIONLENGTH = len(c1) / 2
    else:
        ITERATIONLENGTH = len(s1) / 2
    if len(s1) >= len(c1):
        a = int(len(c1) / 2) + 1
        if s1.find(c1[3:a]) < 0:
            return 0
    else:
        b = int(len(s1) / 2) + 1
        if c1.find(s1[3:b]) < 0:
            return 0
    v = []
    CNT = 0
    TMP = 0
    max_res = 0
    search = s1
    while CNT < ITERATIONLENGTH:
        TMP = (100 - ((pyDamerauLevenschtein(s[3:len(s)], c[3:len(c)])) * 100) / (len(c) - 3)) * ((len(search) - 3) / (len(s1) - 3))
        v.append(TMP)
        CNT = CNT + 1
        if TMP > max_res:
            max_res = TMP
        # s = s[0:len(s)-CNT]
        search = s1[0:len(s1) - CNT]
        s = s1[0:len(s1) - CNT]
        c = c1[0:len(c1) - CNT]
    if (p1 - (l1 * p1 / 100) <= sum(v) / len(v) and sum(v) / len(v) <= p1 + (l1 * p1 / 100)) or sum(v) / len(v) >= p1 + (l1 * p1 / 100):
        return 1
    else:
        return 0
I have implemented a function to be applied to the dataframe, but it is very slow:
def aggregateListAndOccurrencies(list1, list2):
    final = []
    final_cnt = []
    output = []
    cnt_temp = 0
    while list1:
        elem = list1.pop(0)
        cnt = list2.pop(0)
        i = 0
        cnt_temp = cnt
        for item in list1:
            if pyMATCHSIMILARPN(elem, item, 65, 20) == 1:
                cnt_temp = list2[i] + cnt_temp
                list1.pop(i)
                list2.pop(i)
            i += 1
        final.append(elem)
        final_cnt.append(cnt_temp)
    output.append(final)
    output.append(final_cnt)
    return output
How could I apply this in pandas? Any suggestions?

You can simply do a groupby on ID_1 and sum the ID_2_LIST and ID_OCCURRENCY_LIST columns (summing lists concatenates them):
df.groupby('ID_1').agg({'ID_2_LIST': 'sum', 'ID_OCCURRENCY_LIST': 'sum'})
If there is a specific function you'd like the groupby to apply, you can pass it as a lambda in .agg:
df.groupby('ID_1').agg({'ID_2_LIST': 'sum', 'ID_OCCURRENCY_LIST': lambda x: ' '.join(x)})
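If you also need to merge similar IDs inside each concatenated list, a rough sketch (assuming the pyMATCHSIMILARPN and aggregateListAndOccurrencies helpers from the question are defined, and df is the frame built above) is to group and sum first, then post-process each row:
grouped = df.groupby('ID_1', as_index=False).agg(
    {'ID_2_LIST': 'sum', 'ID_OCCURRENCY_LIST': 'sum'})

# Merge "similar" IDs within each row; pass copies because the helper pops from its inputs.
pairs = [aggregateListAndOccurrencies(list(ids), list(counts))
         for ids, counts in zip(grouped['ID_2_LIST'], grouped['ID_OCCURRENCY_LIST'])]
grouped['ID_2_LIST'] = [p[0] for p in pairs]
grouped['ID_OCCURRENCY_LIST'] = [p[1] for p in pairs]
This keeps the pandas part to a plain concatenation and pushes the expensive similarity matching into ordinary Python lists, which is usually easier to profile and optimize.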

Related

How to iterate over previous rows in a dataframe

I have three columns: id (non-unique id), X (categories) and Y (categories). (I don't have a dataset to share yet. I'll try to replicate what I have using a smaller dataset and edit as soon as possible)
I ran a for loop on a very small subset, and based on those results it might take over 4 hours to run this code. I'm looking for a faster way to do this task using pandas (maybe using iterrows, or iterating over previous rows within apply).
For each row I check
whether the current X matches any of previous Xs (check_X = X[:row] == X[row])
whether the current Y matches any of previous Ys (check_Y = Y[:row] == Y[row])
whether the current id does not match any of previous ids (check_id = id[:row] != id[row])
if sum(check_X & check_Y & check_id)>0: then append 1 to the array
else: append 0
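Spelled out as the slow per-row loop I am trying to replace (a sketch, assuming the data is in a DataFrame df with columns id, X and Y):
result = []
for i in range(len(df)):
    check_X = df['X'].iloc[:i] == df['X'].iloc[i]
    check_Y = df['Y'].iloc[:i] == df['Y'].iloc[i]
    check_id = df['id'].iloc[:i] != df['id'].iloc[i]
    result.append(int((check_X & check_Y & check_id).sum() > 0))
df['dup'] = result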
You are probably looking for duplicated:
import pandas as pd

df = pd.DataFrame({'id': [0, 0, 0, 1, 0],
                   'X': [1, 1, 2, 1, 1],
                   'Y': [2, 2, 2, 2, 2]})
df['dup'] = ~df[df.duplicated(['X', 'Y'])].duplicated('id', keep=False).loc[lambda x: ~x]
df['dup'] = df['dup'].fillna(False).astype(int)
print(df)
# Output
   id  X  Y  dup
0   0  1  2    0
1   0  1  2    0
2   0  2  2    0
3   1  1  2    1
4   0  1  2    0
Update
X and Y should be checked separately:
import numpy as np

df = pd.DataFrame({'id': [0, 1, 1, 2, 2, 3, 4],
                   'X': [0, 1, 1, 1, 1, 1, 1],
                   'Y': [0, 2, 2, 2, 2, 2, 2]})
df['dup'] = np.where(df['X'].duplicated() & df['Y'].duplicated() & ~df['id'].duplicated(), 1, 0)
print(df)
# Output
   id  X  Y  dup
0   0  0  0    0
1   1  1  2    0
2   1  1  2    0
3   2  1  2    1
4   2  1  2    0
5   3  1  2    1
6   4  1  2    1
EDIT: the answer from #Corralien using duplicated() will likely be much faster and is the best answer for this specific problem. However, apply is more flexible if you have different things to check.
You could do it with iterrows() or apply(). As far as I know apply() is faster:
check_id, check_x, check_y = set(), set(), set()

def apply_func(row):
    global check_id, check_x, check_y
    if row["id"] not in check_id and row['x'] in check_x and row['y'] in check_y:
        row['duplicate'] = 1
    else:
        row['duplicate'] = 0
    check_id.add(row['id'])
    check_x.add(row['x'])
    check_y.add(row['y'])
    return row

df = df.apply(apply_func, axis=1)
With iterrows():
check_id, check_x, check_y = set(), set(), set()
for i, row in df.iterrows():
    if row["id"] not in check_id and row['x'] in check_x and row['y'] in check_y:
        df.loc[i, 'duplicate'] = 1
    else:
        df.loc[i, 'duplicate'] = 0
    check_id.add(row['id'])
    check_x.add(row['x'])
    check_y.add(row['y'])

Print item of list and another item of list

Suppose I have two lists:
list1=[1, 2, 2, 12, 23]
list2=[2, 3, 5, 3, 4]
I want to arrange list1 and list2 side by side using Python, like so:
| list1 | list2 |
|---|---|
| 1 | 2 |
| 2 | 3 |
| 2 | 5 |
| 12 | 3 |
| 23 | 4 |
but I want to merge the third row (2, 5) into the second, so that it becomes:
| list1 | list2 |
|---|---|
| 1 | 2 |
| 2 | 3,5 |
| 12 | 3 |
| 23 | 4 |
Using itertools.groupby:
from itertools import groupby
list1=[1, 2, 2, 12, 23]
list2=[2, 3, 5, 3, 4]
for key, value in groupby(zip(list1, list2), key=lambda x: x[0]):
    print(f"{key} {','.join(str(y) for x, y in value)}")
or if you prefer using collections.defaultdict:
from collections import defaultdict
list1=[1,2,2,12,23]
list2=[2, 3, 5, 3, 4]
result = defaultdict(list)
for key, value in zip(list1, list2):
    result[key].append(value)
for key, value in result.items():
    print(f"{key} {','.join(map(str, value))}")
using only loops and built-in functions:
list1=[1,2,2,12,23]
list2=[2, 3, 5, 3, 4]
result = {}
for key, value in zip(list1, list2):
    result.setdefault(key, []).append(value)
for key, value in result.items():
    print(f"{key} {','.join(map(str, value))}")
In all three cases the output is
1 2
2 3,5
12 3
23 4
You could zip the two lists together and append them to a dictionary. If a value of list1 already exists as a key in the dictionary, you format the value as a string and append the next one to the same key. I just posted this since it might be easier to follow; however, I really like the answer above from #buran.
list1 = [1, 2, 2, 12, 23]
list2 = [2, 3, 5, 3, 4]
combo_dict = {}
# zip both lists together and iterate over them
for key, val in zip(list1, list2):
    # if an item in list1 already exists in combo_dict, format the
    # respective overlapping value in list2 as a string and append it
    # to the same key in the dict
    if key in combo_dict.keys():
        combo_dict[key] = ",".join([f"{combo_dict[key]}", f"{val}"])
    # if not, simply store the value from list2 under the key from
    # list1 (could also format this as a string)
    else:
        combo_dict[key] = val
# this is just for printing the dict to the console
no_out = [print(f"{key} {val}") for key, val in combo_dict.items()]
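Since the rest of this page is pandas-centric, here is an optional sketch of the same grouping done with a DataFrame and groupby (not required by the answers above):
import pandas as pd

list1 = [1, 2, 2, 12, 23]
list2 = [2, 3, 5, 3, 4]
df = pd.DataFrame({'list1': list1, 'list2': list2})

# Collect the list2 values that share the same list1 key, joined as a string.
out = df.groupby('list1', sort=False)['list2'].agg(lambda s: ','.join(map(str, s)))
print(out.reset_index().to_string(index=False))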

Creating a value matrix in python

I have a dataset as follows:
d = {'dist': [100, 200, 200, 400],'id': [1, 2, 3, 4]}
df = pd.DataFrame(data= d)
I would like to create a value matrix indexed by id, using the calculation dist(id1) - dist(id2):
|    |    1 |    2 |    3 |    4 |
|---:|-----:|-----:|-----:|-----:|
|  1 |    0 |  100 |  100 |  300 |
|  2 | -100 |    0 |    0 |  200 |
|  3 | -100 |    0 |    0 |  200 |
|  4 | -300 | -200 | -200 |    0 |
Any advice will be appreciated.
(Edit) Here's the simplified version via the beauty of numpy:
import numpy as np
d = {'dist': [100, 200, 200, 400],'id': [1, 2, 3, 4]}
a = np.array(d['dist']).reshape(1,-1)
b = np.array(a).reshape(-1,1)
# the solution
print(a - b)
# [[ 0 100 100 300]
# [-100 0 0 200]
# [-100 0 0 200]
# [-300 -200 -200 0]]
(Old Answer) You can do it with a little matrix algebra:
import numpy as np
d = {'dist': [100, 200, 200, 400],'id': [1, 2, 3, 4]}
a = np.array(d['dist']).reshape(1,-1)
b = np.array(a).reshape(-1,1)
# some matrix algebra
c = b.dot(a)
e = c // a   # e[i, j] == dist[i] (the division is exact here)
f = c // b   # f[i, j] == dist[j]
# the solution
print(f - e)
# [[ 0 100 100 300]
# [-100 0 0 200]
# [-100 0 0 200]
# [-300 -200 -200 0]]
I'm not familiar with numpy, but you could create the matrix, given the existing data structure, using this mildly complicated dictionary comprehension:
matrix = {id: {v: d.get("dist")[i] - d.get("dist")[j] for j, v in enumerate(d.get("id"))} for i, id in enumerate(d.get("id"))}
Keys of the matrix are the columns, and keys of each column are the rows. You could probably write this in a much neater fashion, but this is a built-ins-only answer that conforms to your request.
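Since the question already builds a pandas DataFrame, a labelled variant of the numpy answer (just a sketch on the same data) could be:
import numpy as np
import pandas as pd

d = {'dist': [100, 200, 200, 400], 'id': [1, 2, 3, 4]}
df = pd.DataFrame(data=d)

# Outer difference: entry (i, j) is dist[j] - dist[i], labelled by id on both axes.
vals = df['dist'].to_numpy()
matrix = pd.DataFrame(vals[None, :] - vals[:, None],
                      index=df['id'], columns=df['id'])
print(matrix)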

Counting choice streaks in Python

I have a dataset that looks like the following:
Subject | Session | Trial | Choice
--------+---------+-------+-------
1 | 1 | 1 | A
1 | 1 | 2 | B
1 | 1 | 3 | B
1 | 1 | 4 | B
1 | 1 | 5 | B
1 | 1 | 6 | A
2 | 1 | 1 | A
2 | 1 | 2 | A
2 | 1 | 3 | A
I would like to use a Python script to generate the following table:
Subject | Session | streak_count
--------+---------+-------------
1 | 1 | 3
2 | 1 | 1
Where streak_count is a count of the total number of choice streaks made by a given subject during a given session, and a streak is any number of choices of one particular item in a row (>0).
I've tried using some of the suggestions to similar questions here, but I'm having trouble figuring out how to count these instances, rather than measure their length, etc., which seem to be more common queries.
def count():
    love = []
    love1 = []
    streak = -1
    k = 0
    session = 1
    subject = input("What is your subject? ")
    trials = input("How many trials do you wish to do? ")
    trial = 0
    for i in range(int(trials)):
        choice = input("What was the choice? ")
        love.append(choice)
        love1.append(choice)
        trial += 1
        print(subject, trial, choice)
        if love[i] == love1[i-1]:
            streak += 1
    print(subject, session, streak)
This may be what you want: it takes in how many trials you wish to do and whatever your subject is, and if there is a streak it adds one. The reason streak starts at -1 is that when you enter your first answer it adds one, because the negative index wraps back to the answer itself.
I think this is what you are asking for:
import itertools

data = [
    [1, 1, 1, 'A'],
    [1, 1, 2, 'B'],
    [1, 1, 3, 'B'],
    [1, 1, 4, 'B'],
    [1, 1, 5, 'B'],
    [1, 1, 6, 'A'],
    [2, 1, 1, 'A'],
    [2, 1, 2, 'A'],
    [2, 1, 3, 'A']
]

grouped = itertools.groupby(data, lambda x: x[0])
results = dict()
this, last = None, None
for key, group in grouped:
    results[key] = 0
    for c, d in enumerate(group):
        this = d
        streak = c == 0 or this[3] != last[3]
        if streak:
            results[key] += 1
        last = this
print(results)
This yields:
{1: 3, 2: 1}
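Because the other questions on this page use pandas, here is an additional vectorized sketch of the same count (assuming the data sits in a DataFrame with the columns shown in the question): a new streak starts whenever Choice differs from the previous Choice within a Subject/Session group, so counting those change points gives the streak count.
import pandas as pd

df = pd.DataFrame({'Subject': [1, 1, 1, 1, 1, 1, 2, 2, 2],
                   'Session': [1, 1, 1, 1, 1, 1, 1, 1, 1],
                   'Trial':   [1, 2, 3, 4, 5, 6, 1, 2, 3],
                   'Choice':  list('ABBBBAAAA')})

# True at every row that starts a new streak within its Subject/Session group.
new_streak = df.groupby(['Subject', 'Session'])['Choice'].transform(
    lambda s: s != s.shift())

streaks = (new_streak.groupby([df['Subject'], df['Session']])
           .sum().rename('streak_count').reset_index())
print(streaks)
# Subject 1 gets 3, Subject 2 gets 1, matching the expected table.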

Dynamically filtering a pandas dataframe

I am trying to filter a pandas data frame using thresholds for three columns
import pandas as pd
df = pd.DataFrame({"A" : [6, 2, 10, -5, 3],
                   "B" : [2, 5, 3, 2, 6],
                   "C" : [-5, 2, 1, 8, 2]})
df = df.loc[(df.A > 0) & (df.B > 2) & (df.C > -1)].reset_index(drop = True)
df
    A  B  C
0   2  5  2
1  10  3  1
2   3  6  2
However, I want to do this inside a function where the names of the columns and their thresholds are given to me in a dictionary. Here's my first try that works ok. Essentially I am putting the filter inside cond variable and just run it:
df = pd.DataFrame({"A" : [6, 2, 10, -5, 3],
                   "B" : [2, 5, 3, 2, 6],
                   "C" : [-5, 2, 1, 8, 2]})
limits_dic = {"A" : 0, "B" : 2, "C" : -1}

cond = "df = df.loc["
for key in limits_dic.keys():
    cond += "(df." + key + " > " + str(limits_dic[key]) + ") & "
cond = cond[:-2] + "].reset_index(drop = True)"
exec(cond)
df
    A  B  C
0   2  5  2
1  10  3  1
2   3  6  2
Now, finally I put everything inside a function and it stops working (perhaps exec function does not like to be used inside a function!):
df = pd.DataFrame({"A" : [6, 2, 10, -5, 3],
                   "B" : [2, 5, 3, 2, 6],
                   "C" : [-5, 2, 1, 8, 2]})
limits_dic = {"A" : 0, "B" : 2, "C" : -1}

def filtering(df, limits_dic):
    cond = "df = df.loc["
    for key in limits_dic.keys():
        cond += "(df." + key + " > " + str(limits_dic[key]) + ") & "
    cond = cond[:-2] + "].reset_index(drop = True)"
    exec(cond)
    return(df)

df = filtering(df, limits_dic)
df
    A  B  C
0   6  2 -5
1   2  5  2
2  10  3  1
3  -5  2  8
4   3  6  2
I know that the exec function acts differently when used inside a function, but I was not sure how to address the problem. Also, I am wondering whether there is a more elegant way to define a function that does the filtering given two inputs: 1) df and 2) limits_dic = {"A" : 0, "B" : 2, "C" : -1}. I would appreciate any thoughts on this.
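For context: in Python 3, exec cannot rebind the local variables of an enclosing function; an assignment inside the executed string only updates a temporary copy of locals(), which is why the filtering silently does nothing inside the function. A minimal demonstration of that behaviour:
def demo():
    x = 1
    exec("x = 2")   # updates a copy of locals(), not the real local x
    return x

print(demo())  # prints 1, not 2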
If you're trying to build a dynamic query, there are easier ways. Here's one using a list comprehension and str.join:
query = ' & '.join(['{}>{}'.format(k, v) for k, v in limits_dic.items()])
Or, using f-strings with python-3.6+,
query = ' & '.join([f'{k}>{v}' for k, v in limits_dic.items()])
print(query)
'A>0 & C>-1 & B>2'
Pass the query string to df.query, it's meant for this very purpose:
out = df.query(query)
print(out)
    A  B  C
1   2  5  2
2  10  3  1
4   3  6  2
What if my column names have whitespace, or other weird characters?
From pandas 0.25, you can wrap your column name in backticks so this works:
query = ' & '.join([f'`{k}`>{v}' for k, v in limits_dic.items()])
See this Stack Overflow post for more.
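For example, with a hypothetical frame whose column name contains a space:
df2 = pd.DataFrame({'my col': [1, 3, 5], 'B': [2, 4, 6]})
print(df2.query('`my col` > 2'))
#    my col  B
# 1       3  4
# 2       5  6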
You could also use df.eval if you want to obtain a boolean mask for your query, and then indexing becomes straightforward after that:
mask = df.eval(query)
print(mask)
0    False
1     True
2     True
3    False
4     True
dtype: bool

out = df[mask]
print(out)
    A  B  C
1   2  5  2
2  10  3  1
4   3  6  2
String Data
If you need to query columns that use string data, the code above will need a slight modification.
Consider (data from this answer):
df = pd.DataFrame({'gender': list('MMMFFF'),
                   'height': [4, 5, 4, 5, 5, 4],
                   'age': [70, 80, 90, 40, 2, 3]})
print(df)

  gender  height  age
0      M       4   70
1      M       5   80
2      M       4   90
3      F       5   40
4      F       5    2
5      F       4    3
And a list of columns, operators, and values:
column = ['height', 'age', 'gender']
equal = ['>', '>', '==']
condition = [1.68, 20, 'F']
The appropriate modification here is:
query = ' & '.join(f'{i} {j} {repr(k)}' for i, j, k in zip(column, equal, condition))
df.query(query)
   age gender  height
3   40      F       5
For information on the pd.eval() family of functions, their features and use cases, please visit Dynamic Expression Evaluation in pandas using pd.eval().
An alternative to #coldspeed's version:
conditions = None
for key, val in limits_dic.items():
    cond = df[key] > val
    if conditions is None:
        conditions = cond
    else:
        conditions = conditions & cond

print(df[conditions])
An alternative to both posted, that may or may not be more pythonic:
import pandas as pd
import operator
from functools import reduce
df = pd.DataFrame({"A": [6, 2, 10, -5, 3],
                   "B": [2, 5, 3, 2, 6],
                   "C": [-5, 2, 1, 8, 2]})
limits_dic = {"A": 0, "B": 2, "C": -1}
# equiv to [df['A'] > 0, df['B'] > 2 ...]
loc_elements = [df[key] > val for key, val in limits_dic.items()]
df = df.loc[reduce(operator.and_, loc_elements)]
How I do this without building a string or using df.query:
limits_dic = {"A" : 0, "B" : 2, "C" : -1}

cond = None
# Build the conjunction one clause at a time
for key, val in limits_dic.items():
    if cond is None:
        cond = df[key] > val
    else:
        cond = cond & (df[key] > val)

df.loc[cond]
    A  B  C
1   2  5  2
2  10  3  1
4   3  6  2
Note the hard coded (>, &) operators (since I wanted to follow your example exactly).
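If the comparison operator also needs to be dynamic, one possible extension of this pattern (a sketch; the limits structure pairing each column with an operator name and threshold is hypothetical) is to look the operator up in the operator module:
import operator

ops = {'>': operator.gt, '>=': operator.ge, '==': operator.eq}
limits = {'A': ('>', 0), 'B': ('>', 2), 'C': ('>', -1)}

cond = None
for col, (op_name, val) in limits.items():
    clause = ops[op_name](df[col], val)      # e.g. operator.gt(df['A'], 0)
    cond = clause if cond is None else (cond & clause)
print(df.loc[cond])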
