Progressive value collection within a group in pandas - python

I have some data similar to:
# Simulate some data
import pandas as pd  # needed so the snippet runs on its own

# Two groups (id 1 and id 2); action_order is the position of the row inside
# its group and n_actions is the group size.
d = {
    "id": [1, 1, 1, 1, 1, 2, 2, 2, 2],
    "action_order": [1, 2, 3, 4, 5, 1, 2, 3, 4],
    "n_actions": [5, 5, 5, 5, 5, 4, 4, 4, 4],
    "seed": ['1', '2', '3', '4', '5', '10', '11', '12', '13'],
    "time_spent": [0.3, 0.4, 0.5, 0.6, 0.7, 10.1, 11.1, 12.1, 13.1],
}
data = pd.DataFrame(d)
I need a function that for each row will return the values from two columns (seed and time_spent) in that row AND ALL PREVIOUS ROWS within the group as a dictionary. I have attempted to use the apply function as follows but the results are not quite what I need.
# Attempt 1: one {seed: time_spent} dict per group, collected into a list.
# Column names match the simulated frame above (id / seed / time_spent).
# This yields a single dict per group, not the per-row running dict wanted.
(
    data
    .groupby(["id"])[["seed", "time_spent"]]
    .apply(lambda x: dict(zip(x["seed"], x["time_spent"])))
    .tolist()
)
# Attempt 2: same per-group dict, kept as a Series indexed by id.
(
    data
    .groupby("id")[["seed", "time_spent", "action_order"]]
    .apply(lambda x: dict(zip(list(x["seed"]), list(x["time_spent"]))))
)
The new DataFrame should look like this:
id new_col
0 1 {u'1': 0.3}
1 1 {u'1': 0.3, u'2': 0.4}
2 1 {u'1': 0.3, u'3': 0.5, u'2': 0.4}
...

You can keep a running dict and just return a copy of the most recent version on each apply iteration, per group:
def update_cumdict(row, cd):
    """Record this row's seed -> time_spent in cd and return a snapshot.

    cd is mutated in place so that later rows see earlier entries; the
    returned dict is a copy, so each row keeps its own frozen view.
    """
    cd.update({row.seed: row.time_spent})
    return dict(cd)


def wrapper(g):
    """Return, for each row of group g, the running seed -> time_spent dict."""
    running = {}  # fresh accumulator for every group
    return g.apply(update_cumdict, axis=1, args=(running,))
# Build the running dicts group by group and concatenate them. Each piece
# keeps the original row labels, so the assignment aligns on index instead of
# on row position (which would misalign rows when the frame is not already
# sorted by id with a default RangeIndex).
data["new_col"] = pd.concat([wrapper(g) for _, g in data.groupby("id")])
data.new_col
0 {'1': 0.3}
1 {'1': 0.3, '2': 0.4}
2 {'1': 0.3, '2': 0.4, '3': 0.5}
3 {'1': 0.3, '2': 0.4, '3': 0.5, '4': 0.6}
4 {'1': 0.3, '2': 0.4, '3': 0.5, '4': 0.6, '5': ...
5 {'10': 10.1}
6 {'10': 10.1, '11': 11.1}
7 {'10': 10.1, '11': 11.1, '12': 12.1}
8 {'10': 10.1, '11': 11.1, '12': 12.1, '13': 13.1}
Name: new_col, dtype: object

How about this.
In [15]: data.groupby(['id']).apply(lambda d: pd.Series(np.arange(len(d))).apply(lambda x: d[['seed', 'time_spent']].iloc[:x+1].to_dict()))
Out[15]:
id
1 0 {'seed': {0: '1'}, 'time_spent': {0: 0.3}}
1 {'seed': {0: '1', 1: '2'}, 'time_spent': {0: 0...
2 {'seed': {0: '1', 1: '2', 2: '3'}, 'time_spent...
3 {'seed': {0: '1', 1: '2', 2: '3', 3: '4'}, 'ti...
4 {'seed': {0: '1', 1: '2', 2: '3', 3: '4', 4: '...
2 0 {'seed': {5: '10'}, 'time_spent': {5: 10.1}}
1 {'seed': {5: '10', 6: '11'}, 'time_spent': {5:...
2 {'seed': {5: '10', 6: '11', 7: '12'}, 'time_sp...
3 {'seed': {5: '10', 6: '11', 7: '12', 8: '13'},...
dtype: object
Additionally, you can change the `orient` parameter of the .to_dict() method to change the output dict style; refer to: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_dict.html
Or maybe this is what you want:
In [18]: data.groupby(['id']).apply(lambda d: pd.Series(np.arange(len(d))).apply(lambda x: dict(zip(d['seed'].iloc[:x+1], d['time_spent'].iloc[:x+1]))))
Out[18]:
id
1 0 {'1': 0.3}
1 {'1': 0.3, '2': 0.4}
2 {'1': 0.3, '2': 0.4, '3': 0.5}
3 {'1': 0.3, '2': 0.4, '3': 0.5, '4': 0.6}
4 {'1': 0.3, '2': 0.4, '3': 0.5, '4': 0.6, '5': ...
2 0 {'10': 10.1}
1 {'10': 10.1, '11': 11.1}
2 {'10': 10.1, '11': 11.1, '12': 12.1}
3 {'10': 10.1, '11': 11.1, '12': 12.1, '13': 13.1}
dtype: object

Related

how to apply a check on below scenarios using if else conditions?

I have a dataframe like this - please refer to the dataframe shown in the image.
There are four columns ('status', 'preferred_time', 'history', 'id'), and I need to check whether every column has a value. The history column holds a nested list in some cases, so I need to check specifically that each entry in the nested list has all of the mandatory keys 'branch', 'rank', 'discharge_status', 'service_start', 'job_code', 'post_intention' with values. I then need to add a column named "output" to the dataframe: "completed" if all the columns have values, else "pending" if any column is blank, NaN or [{}], or if the history column is missing any key-value pair.
From the image, only the first row should be in the completed state; the rest should fall into pending.
Please help me build a better if/else check for this scenario.
Thanks in advance.
Dict of above df image -
{'status': {0: 'No', 1: 'No', 2: nan, 3: 'No', 4: 'No'},
'preferred_time': {0: "['Morning', 'Midday', 'Afternoon']",
1: [],
2: "['Morning'] ",
3: nan,
4: "['Morning', 'Midday'] "},
'history': {0: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1999-02-13', 'job_code': '09', 'post_intention': ['No']}]",
1: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1999-02-13', 'job_code': '09', 'post_intention': ['No']}]",
2: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1995-02-13', 'job_code': '09', 'post_intention': ['No']},{'branch': 'A', 'rank': 'E6', 'discharge_status': 'Honorable Discharge', 'service_start': '2015-02-13', 'job_code': '09'}]",
3: nan,
4: '[{}]'},
'id': {0: 1, 1: 5, 2: 2, 3: 3, 4: 4}}
I tried below lines of code -
But I don't know how to check all the four columns in a single if statement -
for i in df.index:
status = df['status'][i]
preferred_time = df['preferred_time'][i]
id = df['id'][i]
history = df['history'][i]
if status and preferred_time and id and status!='' and preferred_time!= '' and id!='':
enroll_status = "completed"
else:
enroll_status = "pending"
if history!= '' or str(history)!= '[{}]':
for item in history:
if 'branch' in item.keys() and'rank' in item.keys() and'discharge_status' in item.keys() and'service_start' in item.keys() and 'job_code' in item.keys() and 'post_intention' in item.keys():
enroll_status = "completed"
else:
enroll_status = "pending"
Consider the following:
import numpy as np
import pandas as pd
from numpy import nan
import ast  # stdlib; parses the stringified history lists without executing code


def check_list(L):
    """Return True when L is a non-empty list of dicts that all contain every key in keys_req.

    Anything that is not a list (for example NaN) and an empty list both
    count as incomplete, so they return False.
    """
    if not isinstance(L, list) or not L:
        return False
    return all(isinstance(entry, dict) and k in entry for entry in L for k in keys_req)


def _parse_history(x):
    """Turn a stringified list of dicts into real Python objects.

    Non-string values (such as NaN) are passed through unchanged. Strings that
    are not valid Python literals become NaN so the row ends up "pending"
    instead of aborting the whole script.
    """
    if not isinstance(x, str):
        return x
    try:
        # literal_eval only accepts literals, unlike eval which runs arbitrary code.
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return nan


# Index 0 -> "pending", index 1 -> "completed"; indexed with a 0/1 array below.
labels = np.array(["pending", "completed"])
keys_req = ['branch', 'rank', 'discharge_status', 'service_start', 'job_code', 'post_intention']
d = {
    'status': {0: 'No', 1: 'No', 2: nan, 3: 'No', 4: 'No'},
    'preferred_time': {
        0: "['Morning', 'Midday', 'Afternoon']",
        1: nan,
        2: "['Morning'] ",
        3: nan,
        4: "['Morning', 'Midday'] ",
    },
    'history': {
        0: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1999-02-13', 'job_code': '09', 'post_intention': ['No']}]",
        1: nan,
        2: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1995-02-13', 'job_code': '09', 'post_intention': ['No']},{'branch': 'A', 'rank': 'E6', 'discharge_status': 'Honorable Discharge', 'service_start': '2015-02-13', 'job_code': '09'}]",
        3: nan,
        4: '[{}]',
    },
    'id': {0: 1, 1: 5, 2: 2, 3: 3, 4: 4},
}
df = pd.DataFrame(d)
# Intermediate columns are kept only to make each step visible.
df['history_list'] = df['history'].apply(_parse_history)
df['mandatory_keys'] = df['history_list'].apply(check_list)
# True for rows in which no column is NaN.
df['no_nans'] = ~pd.isna(df).any(axis=1)
df['output_tf'] = df['mandatory_keys'] & df['no_nans']
df['output'] = labels[df['output_tf'].to_numpy(dtype=int)]
Note that I corrected some typos from your dataframe in my copied version of the dictionary d (for example, 'rank:'E7' was replaced with 'rank':'E7'). The incremental columns added (history_list, mandatory_keys, no_nans, output_tf) are there to make understanding the process that I applied here easier; it is not actually necessary to add these to the dataframe if, for example, you want to use as little space as possible. The script above results in the following dataframe df:
status preferred_time \
0 No ['Morning', 'Midday', 'Afternoon']
1 No NaN
2 NaN ['Morning']
3 No NaN
4 No ['Morning', 'Midday']
history id \
0 [{'branch': 'A', 'rank': 'E7', 'discharge_stat... 1
1 NaN 5
2 [{'branch': 'A', 'rank': 'E7', 'discharge_stat... 2
3 NaN 3
4 [{}] 4
history_list mandatory_keys no_nans \
0 [{'branch': 'A', 'rank': 'E7', 'discharge_stat... True True
1 NaN False False
2 [{'branch': 'A', 'rank': 'E7', 'discharge_stat... False False
3 NaN False False
4 [{}] False True
output_tf output
0 True completed
1 False pending
2 False pending
3 False pending
4 False pending
Here's a more parsimonious version (which doesn't add the unnecessary columns or store the extra "labels" variable).
import numpy as np
import pandas as pd
from numpy import nan
import ast  # stdlib; parses the stringified history lists without executing code


def check_list(L):
    """Return True when L is a non-empty list of dicts that all contain every key in keys_req.

    Anything that is not a list (for example NaN) and an empty list both
    count as incomplete, so they return False.
    """
    if not isinstance(L, list) or not L:
        return False
    return all(isinstance(entry, dict) and k in entry for entry in L for k in keys_req)


def _parse_history(x):
    """Parse a stringified list of dicts; pass non-strings through, map bad strings to NaN."""
    if not isinstance(x, str):
        return x
    try:
        # literal_eval only accepts literals, unlike eval which runs arbitrary code.
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return nan


keys_req = ['branch', 'rank', 'discharge_status', 'service_start', 'job_code', 'post_intention']
d = {
    'status': {0: 'No', 1: 'No', 2: nan, 3: 'No', 4: 'No'},
    'preferred_time': {
        0: "['Morning', 'Midday', 'Afternoon']",
        1: nan,
        2: "['Morning'] ",
        3: nan,
        4: "['Morning', 'Midday'] ",
    },
    'history': {
        0: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1999-02-13', 'job_code': '09', 'post_intention': ['No']}]",
        1: nan,
        2: "[{'branch': 'A', 'rank': 'E7', 'discharge_status': 'Honorable Discharge', 'service_start': '1995-02-13', 'job_code': '09', 'post_intention': ['No']},{'branch': 'A', 'rank': 'E6', 'discharge_status': 'Honorable Discharge', 'service_start': '2015-02-13', 'job_code': '09'}]",
        3: nan,
        4: '[{}]',
    },
    'id': {0: 1, 1: 5, 2: 2, 3: 3, 4: 4},
}
df = pd.DataFrame(d)
# A row is complete when its history has every mandatory key and no column is NaN.
is_complete = (
    df['history'].apply(_parse_history).apply(check_list)
    & ~pd.isna(df).any(axis=1)
)
# Index 0 -> "pending", index 1 -> "completed".
df['output'] = np.array(["pending", "completed"])[is_complete.to_numpy(dtype=int)]
A version that addresses your latest comment:
import ast  # stdlib; parses the stringified history lists without executing code

df = pd.DataFrame(d)
display(df)
# A row is "completed" only when all three conditions hold:
#   1. its history parses to a list that passes check_list,
#   2. no column is NaN,
#   3. preferred_time is not blank or empty. Comparing the stripped string
#      form catches "[]", "[] " and a real empty list alike.
df['output'] = np.array(["pending","completed"])[
    (df['history'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
     .apply(check_list)
     & ~pd.isna(df).any(axis=1)
     & df['preferred_time'].map(lambda v: str(v).strip() not in ("", "[]"))
    ).to_numpy(dtype=int)]

Create nested dictionaries of arbitrary length

I have the following problem. I have a nested dictionary like the following
A = {'A':{'STATES':['0','1','2']}, 'B':{'STATES':['10','20']}}
What I want to build eventually is a nested dictionary with all the possible combinations of STATES. So for a known number of keys in A is trivial as this
# Two-level table: every state of 'A' maps to every state of 'B', each set to 0.
_dict = {}
for outer in A['A']['STATES']:
    inner = _dict.setdefault(outer, {})
    for state in A['B']['STATES']:
        inner[state] = 0
This gives
{'0': {'10': 0, '20': 0}, '1': {'10': 0, '20': 0}, '2': {'10': 0, '20': 0}}
which is what I want. However, I do not know the number of keys in A beforehand. What would be an efficient way to do the same with an arbitrary number of elements in A?
EDIT
For instance with three elements I would have
{'0': {'10': {'100':0}, '20': {'100':0}}, '1': {'10': {'100':0}, '20': {'100':0}}, '2': {'10': {'100':0}, '20': {'100':0}}}
This problem is a little complex, but it can be split up into three parts:
Parse all the values, mapping a list to every valid key.
Get the list of all the combinations in the order of dictionary insertion.
Translate the list of tuples into a nested dictionary, looping over the values inside the tuple itself - because we don't know its length.
import itertools
A = {'A':{'STATES':['0','1','2']}, 'B':{'STATES':['10','20']}, 'C':{'STATES':['100']}}

# Step 1: reduce A to {key: list of states}, keeping only the keys whose
# "STATES" entry exists and is non-empty.
Ared = {}
for name, spec in A.items():
    if spec.get("STATES"):
        Ared[name] = spec["STATES"]
print(Ared) # {'A': ['0', '1', '2'], 'B': ['10', '20'], 'C': ['100']}

# Step 2: Cartesian product of the state lists, in dictionary insertion order.
combs = list(itertools.product(*Ared.values()))
print(combs) # [('0', '10', '100'), ('0', '20', '100'), ('1', '10', '100'), ('1', '20', '100'), ('2', '10', '100'), ('2', '20', '100')]

# Step 3: walk each combination from the root, creating intermediate levels
# as needed; the last element of the tuple becomes a leaf with value 0.
d = {}
for comb in combs:
    node = d
    last = len(comb) - 1
    for depth, key in enumerate(comb):
        if depth == last:
            node[key] = 0
            break
        if not node.get(key):
            node[key] = {}
        node = node[key]
print(d) # {'0': {'10': {'100': 0}, '20': {'100': 0}}, '1': {'10': {'100': 0}, '20': {'100': 0}}, '2': {'10': {'100': 0}, '20': {'100': 0}}}
You can use recursion:
A = {'A':{'STATES':['0','1','2']}, 'B':{'STATES':['10','20']}}
def combos(d):
    """Build a nested dict from a sequence of state lists.

    Each state in the first list becomes a key whose value is the nested dict
    built from the remaining lists. With no lists left the leaf value is 0.
    """
    if not d:
        return 0
    head, tail = d[0], d[1:]
    # Recurse once per key so that sibling branches never share a dict object.
    return {state: combos(tail) for state in head}
print(combos([j['STATES'] for j in A.values()]))
Output:
{'0': {'10': 0, '20': 0}, '1': {'10': 0, '20': 0}, '2': {'10': 0, '20': 0}}
With more than two keys:
A = {'A':{'STATES':['0','1','2']}, 'B':{'STATES':['10','20']}, 'C':{'STATES':['100']}}
print(combos([j['STATES'] for j in A.values()]))
Output:
{'0': {'10': {'100': 0}, '20': {'100': 0}}, '1': {'10': {'100': 0}, '20': {'100': 0}}, '2': {'10': {'100': 0}, '20': {'100': 0}}}

Merge list of dict and add new values

I need to merge 2 lists of dicts by conditions, without using pandas
x1 - fact values
x1 = [{'id': '94ffe1d6-0afa-11ec-b139-4cd98f4d62e3',
'fact_group': '0.05',
'probability': 0.2},
{'id': '86ae0229-0af8-11ec-be8c-4cd98f847094',
'fact_group': '0.05',
'probability': 0.56},
{'id': '867ef7ac-0af8-11ec-be8c-4cd98f847094',
'fact_group': '0.2',
'probability': 0.31},
{'id': '211bc00c-0af6-11ec-b139-4cd98f4d62e3',
'fact_group': '0.2',
'probability': 0.96}]
x2 - list of dict with intervals probability to labels point
x2 = [{'group': 0.05,
'predict_labels': 0,
'predict_intervals_min': 0.00,
'predict_intervals_max': 0.6},
{'group': 0.05,
'predict_labels': 1,
'predict_intervals_min': 0.6,
'predict_intervals_max': 1.0},
{'group': 0.2,
'predict_labels': 2,
'predict_intervals_min': 0.0,
'predict_intervals_max': 0.45},
{'group': 0.2,
'predict_labels': 3,
'predict_intervals_min': 0.45,
'predict_intervals_max': 1.0}]
I need to merge them by x1['fact_group'] & x2['group']
and x1['probability']>=x2['predict_intervals_min']
and x1['probability']<x2['predict_intervals_max']
Expected: update x1 by x2['predict_labels'] by thresholds and group
x3 = [{'id': '94ffe1d6-0afa-11ec-b139-4cd98f4d62e3',
'predict_labels': 1,
'fact_group': '0.05',
'predict_labels': 0,
'probability': 0.2},
{'id': '86ae0229-0af8-11ec-be8c-4cd98f847094',
'predict_labels': 1,
'fact_group': '0.05',
'predict_labels': 0,
'probability': 0.56},
{'id': '867ef7ac-0af8-11ec-be8c-4cd98f847094',
'fact_group': '0.2',
'predict_labels': 2,
'probability': 0.31},
{'id': '211bc00c-0af6-11ec-b139-4cd98f4d62e3',
'fact_group': '0.2',
'predict_labels': 3,
'probability': 0.96}]
I suggest this simple solution:
# For every fact record in x1, find the x2 interval of the same group whose
# half-open range [min, max) contains the record's probability, and emit the
# record with that interval's label attached. Records that match no interval
# are left out of the result.
result = []
for y1 in x1:
    # Compare groups as numbers: fact_group is a string and group is a float,
    # and a string comparison would treat '0.20' and 0.2 as different groups.
    fact_group = float(y1['fact_group'])
    for y2 in x2:
        if (fact_group == float(y2['group'])
                and y2['predict_intervals_min'] <= y1['probability'] < y2['predict_intervals_max']):
            # Build a new dict so the entries of x1 are not modified in place.
            result.append({**y1, 'predict_labels': y2['predict_labels']})
            # Stop at the first matching interval so each record appears at most once.
            break
print(result)

Count values in dict of pandas series

I have a pandas series of dicts like this:
print(df['genres'])
0 {'0': '1', '1': '4', '2': '23'}
1 {'0': '1', '1': '25', '2': '4', '3': '37'}
2 {'0': '9'}
print(type(df['genres']))
<class 'pandas.core.series.Series'>
print(type(df['genres'][0]))
<class 'dict'>
I want to count the values to get something like this:
{'1': 2, '4': 2, '9': 1, '23': 1, '25': 1, '37': 1}
I tried the following:
print(Counter(chain.from_iterable(df.genres.values)))
Counter({'0': 3, '1': 2, '2': 2, '3': 1})
print(pd.Series(df['genres']).value_counts())
{'0': '1', '1': '4', '2': '23'} 1
{'0': '1', '1': '25', '2': '4', '3': '37'} 1
{'0': '9'} 1
I think it is pretty easy for someone more experienced than me. But I really don't get it ...
Try:
pd.DataFrame(list(df.genres)).stack().value_counts().to_dict()
Output:
{'1': 2, '4': 2, '37': 1, '9': 1, '23': 1, '25': 1}

How to remove some values from a python dictionary

I have this python dictionary myDict:
{'Age': {0: '39'}, 'DailyRate': {0: '903'}, 'DistanceFromHome': {0: '2'}, 'EnvironmentSatisfaction': {0: '1'}, 'HourlyRate': {0: '41'}, 'JobInvolvement': {0: '4'}, 'JobLevel': {0: '3'}, 'JobSatisfaction': {0: '3'}, 'MonthlyIncome': {0: '7880'}, 'MonthlyRate': {0: '2560'}, 'NumCompaniesWorked': {0: '0'}, 'PercentSalaryHike': {0: '18'}, 'RelationshipSatisfaction': {0: '4'}, 'StandardHours': {0: '80'}, 'TotalWorkingYears': {0: '9'}, 'TrainingTimesLastYear': {0: '3'}, 'YearsAtCompany': {0: '8'}, 'YearsInCurrentRole': {0: '7'}, 'YearsSinceLastPromotion': {0: '0'}, 'YearsWithCurrManager': {0: '7'}, 'MaritalStatus_': {0: '2'}, 'JobRole_': {0: '7'}, 'Gender_': {0: '1'}, 'EducationField_': {0: '1'}, 'Department_': {0: '2'}, 'BusinessTravel_': {0: '2'}, 'OverTime_': {0: '1'}, 'Over18_': {0: '1'}}
As you can see, if I take one entry from the above sample, as below,
{'Age': {0: '39'}}
There is an additional 0 in front of the value 39, and this zero is present in every key-value pair.
How can I get rid of this 0, so it looks like this:
{'Age': '39'}
I tried this method, but it removes the whole key instead of the 0:
map(myDict.pop, ['Age',''])
Can someone please help me?
You can use dictionary comprehension to solve this issue. Try doing:
new_dict = {key: value[0] for key, value in old_dict.items()}
Here, you iterate through each key, value pair in the dictionary and use the key of the old dictionary as the key of the new dictionary. But the value becomes the value stored under key 0 of the dictionary inside the dictionary.
For an example, the key starts at 'Age', so the first key of the new dictionary is 'Age'. The value however is {0: '39'}[0] which is '39'. So the first element of the dictionary is 'Age': '39'
you can read it by the following code:
# Named myDict rather than dict so the builtin dict type is not shadowed.
myDict = {'Age': {0: '39'}, 'DailyRate': {0: '903'}, 'DistanceFromHome': {0: '2'}, 'EnvironmentSatisfaction': {0: '1'}, 'HourlyRate': {0: '41'}, 'JobInvolvement': {0: '4'}, 'JobLevel': {0: '3'}, 'JobSatisfaction': {0: '3'}, 'MonthlyIncome': {0: '7880'}, 'MonthlyRate': {0: '2560'}, 'NumCompaniesWorked': {0: '0'}, 'PercentSalaryHike': {0: '18'}, 'RelationshipSatisfaction': {0: '4'}, 'StandardHours': {0: '80'}, 'TotalWorkingYears': {0: '9'}, 'TrainingTimesLastYear': {0: '3'}, 'YearsAtCompany': {0: '8'}, 'YearsInCurrentRole': {0: '7'}, 'YearsSinceLastPromotion': {0: '0'}, 'YearsWithCurrManager': {0: '7'}, 'MaritalStatus_': {0: '2'}, 'JobRole_': {0: '7'}, 'Gender_': {0: '1'}, 'EducationField_': {0: '1'}, 'Department_': {0: '2'}, 'BusinessTravel_': {0: '2'}, 'OverTime_': {0: '1'}, 'Over18_': {0: '1'}}
print(myDict['Age'][0])
# to convert just do:
# pop removes the nested {0: '39'} entry; re-assigning stores the bare value,
# which places 'Age' at the end of the dictionary.
myDict['Age'] = myDict.pop('Age')[0]
you will get the following dic as result:
{'DailyRate': {0: '903'}, 'DistanceFromHome': {0: '2'}, 'EnvironmentSatisfaction': {0: '1'}, 'HourlyRate': {0: '41'}, 'JobInvolvement': {0: '4'}, 'JobLevel': {0: '3'}, 'JobSatisfaction': {0: '3'}, 'MonthlyIncome': {0: '7880'}, 'MonthlyRate': {0: '2560'}, 'NumCompaniesWorked': {0: '0'}, 'PercentSalaryHike': {0: '18'}, 'RelationshipSatisfaction': {0: '4'}, 'StandardHours': {0: '80'}, 'TotalWorkingYears': {0: '9'}, 'TrainingTimesLastYear': {0: '3'}, 'YearsAtCompany': {0: '8'}, 'YearsInCurrentRole': {0: '7'}, 'YearsSinceLastPromotion': {0: '0'}, 'YearsWithCurrManager': {0: '7'}, 'MaritalStatus_': {0: '2'}, 'JobRole_': {0: '7'}, 'Gender_': {0: '1'}, 'EducationField_': {0: '1'}, 'Department_': {0: '2'}, 'BusinessTravel_': {0: '2'}, 'OverTime_': {0: '1'}, 'Over18_': {0: '1'}, 'Age': '39'}
Maybe try the following code:
# Pair every outer key with the first (and only) value of its inner dict.
keys = myDict.keys()
valuesWithZero = myDict.values()
# next(iter(...)) takes the first value without needing to know its key.
valuesNoZero = [next(iter(inner.values())) for inner in valuesWithZero]
newDict = dict(zip(keys, valuesNoZero))
print(newDict)
# should output: {'Age': '39', 'DailyRate': '903', 'DistanceFromHome': '2', 'EnvironmentSatisfaction': '1', 'HourlyRate': '41', 'JobInvolvement': '4', 'JobLevel': '3', 'JobSatisfaction': '3', 'MonthlyIncome': '7880', 'MonthlyRate': '2560', 'NumCompaniesWorked': '0', 'PercentSalaryHike': '18', 'RelationshipSatisfaction': '4', 'StandardHours': '80', 'TotalWorkingYears': '9', 'TrainingTimesLastYear': '3', 'YearsAtCompany': '8', 'YearsInCurrentRole': '7', 'YearsSinceLastPromotion': '0', 'YearsWithCurrManager': '7', 'MaritalStatus_': '2', 'JobRole_': '7', 'Gender_': '1', 'EducationField_': '1', 'Department_': '2', 'BusinessTravel_': '2', 'OverTime_': '1', 'Over18_': '1'}
Same process as the accepted answer but uses some of Python's functional features.
import operator
# zero(inner) is inner[0], i.e. the value stored under key 0 of each inner dict.
zero = operator.itemgetter(0)
newdict = {key: zero(inner) for key, inner in myDict.items()}

Categories