I have multiple dataframes which are similar to the one below:
df:
Name Value1 Value2
A 98 57
B 267 962
C 43 423
D 612 34
I need to use a function on the above dataframe which will perform some calculations and output some variables.
# Question's attempt: builds one formatted message per row of the module-level
# `df` and collects the messages in `c001`.
# NOTE(review): `human` is not defined in this snippet - presumably a number
# formatter defined elsewhere; confirm.
def my_func()
c001=[]
for _, value in df.iterrows():
var1 = value['Value1']
# NOTE(review): the sample frame above lists the columns Name/Value1/Value2,
# not 'Value1%' - confirm the real column name.
var2 = value['Value1%']
seg1 = value['Name']
flag1 = 'over' if var1>0 else 'under'
kpi = 'YYT'
c001.append(f"{seg1} {kpi} {flag1} Plan by {human(var1)}({abs(var2)}%) ")
# Bare expression: the second message is evaluated but neither returned nor printed.
c001[1]
How do I use this function on the input dataframe to print the value in variable c001[1]?
I hope I understood you correctly:
def my_func():
    """Build one plan-status message per row of ``df`` and return the second.

    Reads the module-level ``df`` and the module-level ``human`` formatter.

    Returns:
        The message built for the second row (``c001[1]``).

    Raises:
        IndexError: if ``df`` has fewer than two rows.
    """
    c001 = []
    for _, value in df.iterrows():
        var1 = value['Value1']
        # NOTE(review): the sample frame in the question lists 'Value2', not
        # 'Value1%' - confirm the real column name.
        var2 = value['Value1%']
        seg1 = value['Name']
        flag1 = 'over' if var1 > 0 else 'under'
        kpi = 'YYT'
        c001.append(f"{seg1} {kpi} {flag1} Plan by {human(var1)}({abs(var2)}%) ")
    return c001[1]


print(my_func())
You can try to create "c001" as a column and then print it.
def my_func(value):
    """Format the plan-status message for a single row.

    ``value`` is one row of the frame (as passed by ``DataFrame.apply`` with
    ``axis=1``); the 'Value1', 'Value1%' and 'Name' entries are read from it.
    """
    amount = value['Value1']
    percent = value['Value1%']
    segment = value['Name']
    if amount > 0:
        direction = 'over'
    else:
        direction = 'under'
    metric = 'YYT'
    return f"{segment} {metric} {direction} Plan by {human(amount)}({abs(percent)}%) "


# Store one message per row in a new column, then show that column.
df["c001"] = df.apply(my_func, axis=1)
print(df["c001"])
The result will look like:
0 A YYT over Plan by 98(57%)
1 B YYT over Plan by 267(962%)
2 C YYT over Plan by 43(423%)
3 D YYT over Plan by 612(34%)
Name: c001, dtype: object
Related
I want to create a dataset with dummy variables from the original data based on predefined bins. I have tried using loops and splits, but it's not efficient. I'd appreciate your help.
## original data
data_dict = {"Age":[29,35,42,11,43],"Salary":[4380,3280,8790,1200,5420],
"Payments":[23190,1780,3400,12900,7822]}
df = pd.DataFrame(data_dict)
df
Predefined bins:
card_dict = {"Dummy Variable":["Age:(-inf,24)","Age:(24,35)","Age:(35,49)","Age:(49,60)","Age:(60,inf)",
"Payments:(-inf,7654)","Payments:(7654,9088)","Payments:(9088,12055)","Payments:(12055,inf)",
"Salary:(-inf,2300)","Salary:(2300,3800)","Salary:(3800,5160)",
"Salary:(5160,7200)","Salary:(7200,inf)"]}
card = pd.DataFrame(card_dict)
card
My code is as follows:
# for numerical variables
def prepare_numerical_data(data, scard):
"""
Create dummy variables from the numerical columns.

Adds one column per entry of scard['Dummy Variable'] to `data` (in place)
and returns `data`.
"""
# numerical columns
# NOTE: this reads the module-level `df`, not the `data` argument.
num_df = df.select_dtypes(exclude='object')
num_cols = num_df.columns.values
variable_names = list(set([val.split(':')[0] for val in scard['Dummy Variable']])) # to have the same columns used to create the scorecard
num_variables = [x for x in variable_names if x in num_cols] # select numerical variables only
for i in num_variables:
for j in scard['Dummy Variable']:
# This only checks that the bin's variable is *some* numerical variable; it is
# not compared with `i`, so every bin column `j` is recomputed from every
# variable `i` and the last variable processed decides its final content.
if j.split(":")[0] in num_variables:
for val in data[i].unique():
# Bounds are parsed from "Name:(low,high)"; the test used is low < val <= high.
if (val > (float(j.split(':')[1].split(',')[0][1:]))) & (val <= (float(j.split(':')[1].split(',')[1][:-1]))):
data.loc[data[i] == val, j] = 1
else:
data.loc[data[i] == val, j] = 0
return data
Here are the results:
result_df = prepare_numerical_data(df,card)
result_df
The results are not OK for salary and payments columns. The function didn't create correct dummies for the two columns as it did for age. How can I correct that?
This worked for me. Initially my code was not looping through every column in the dataframe.
def create_dummies(data, card):
    """Create 0/1 dummy columns for the non-object columns of ``data``.

    Each entry of ``card["Dummy Variable"]`` has the form
    ``"<column>:(<low>,<high>)"`` (square brackets are accepted too). For every
    entry whose ``<column>`` is a non-object column of ``data``, a new column
    named after the entry is added, holding 1.0 where ``low <= value < high``
    and 0.0 otherwise. Rows whose source value is missing stay NaN.

    Args:
        data: input DataFrame; it is not modified.
        card: DataFrame with a "Dummy Variable" column of bin labels.

    Returns:
        A new DataFrame with the non-object columns of ``data`` followed by
        the dummy columns, grouped by source column in ``data`` order.

    Raises:
        IndexError, ValueError: if a label for a known column is malformed.
    """
    # Work on an explicit copy so the dummies never land on a view of ``data``.
    num_df = data.select_dtypes(exclude='object').copy()
    source_cols = list(num_df.columns)
    known = set(source_cols)

    # Parse every relevant label once instead of once per cell.
    bins = []
    for label in card["Dummy Variable"].values:
        pieces = label.split(":")
        variable = pieces[0]
        if variable not in known:
            continue
        bounds = pieces[1].strip("()").strip("[]").split(",")
        bins.append((label, variable.lower(), float(bounds[0]), float(bounds[1])))

    for column in source_cols:
        values = num_df[column]
        for label, variable, lower, upper in bins:
            if variable != column.lower():
                continue
            inside = (values >= lower) & (values < upper)
            # ``where`` keeps NaN for rows that have no source value.
            num_df[label] = inside.astype(float).where(values.notna())
    return num_df
Say I have some dataframe like below and I create a new column (track_len) which gives the length of the column track_no.
import pandas as pd
df = pd.DataFrame({'item_id': [1,2,3], 'track_no': ['qwerty23', 'poiu2', 'poiuyt5']})
df['track_len'] = df['track_no'].str.len()
df.head()
My Question is:
How do I now create a new column (new_col) which selects a specific subset of the track_no string, depending on the length of the track number (track_len)?
I have tried creating a function which outputs the specific string slice of track_no for the various track_len conditions, and then using an apply method to create the column, but it doesn't work. The code is below:
Tried:
# Question's attempt: pick a leading slice of track_no whose length depends on track_len.
def f(row):
if row['track_len'] == 8:
# NOTE(review): with apply(axis=1) `row['track_no']` is presumably a plain
# Python string, which has no `.str` accessor - confirm.
val = row['track_no'].str[0:3]
elif row['track_len'] == 5:
val = row['track_no'].str[0:1]
# `=7` is an assignment, not the `==` comparison used in the other branches.
elif row['track_len'] =7:
val = row['track_no'].str[0:2]
return val
df['new_col'] = df.apply(f, axis=1)
df.head()
Thus the desired output should be (based on string slicing output of f):
Output
{new_col: ['qwe', 'p', 'po']}
If there are alternative better solutions to this problem those would also be appreciated.
Your function works well; you just need to remove the .str part in your if blocks, because the values are already strings:
def f(row):
    """Return the leading slice of ``row['track_no']`` chosen by its length.

    The slice length depends on ``row['track_len']``: 8 -> first 3 characters,
    5 -> first character, 7 -> first 2 characters.

    Args:
        row: mapping (for example a DataFrame row) with 'track_len' and
            'track_no' entries.

    Returns:
        The sliced string, or None when ``track_len`` is not one of the
        handled lengths (previously this case raised UnboundLocalError).
    """
    prefix_len_by_track_len = {8: 3, 5: 1, 7: 2}
    prefix_len = prefix_len_by_track_len.get(row['track_len'])
    if prefix_len is None:
        return None
    return row['track_no'][:prefix_len]
df['new_col'] = df.apply(f, axis=1)
df.head()
#Output:
item_id track_no track_len new_col
0 1 qwerty23 8 qwe
1 2 poiu2 5 p
2 3 poiuyt5 7 po
I am trying to assign a value to a dataframe column based on a value that falls in between two values of another dataframe:
intervals = pd.DataFrame(columns = ['From','To','Value'], data = [[0,100,'A'],[100,200,'B'],[200,500,'C']])
print('intervals\n',intervals,'\n')
points = pd.DataFrame(columns = ['Point', 'Value'], data = [[45,'X'],[125,'X'],[145,'X'],[345,'X']])
print('points\n',points,'\n')
DesiredResult = pd.DataFrame(columns = ['Point', 'Value'], data = [[45,'A'],[125,'B'],[145,'B'],[345,'C']])
print('DesiredResult\n',DesiredResult,'\n')
Many thanks
Let's use map, first create a series using pd.IntervalIndex with from_arrays method:
# Series of 'Value' indexed by the From/To intervals. It gets its own name so
# that the original `intervals` frame (with its 'From'/'To' columns) stays
# available to later code.
interval_lookup = intervals.set_index(
    pd.IntervalIndex.from_arrays(intervals['From'], intervals['To'])
)['Value']
# map() looks each point up in the interval index.
points['Value'] = points['Point'].map(interval_lookup)
Output:
Point Value
0 45 A
1 125 B
2 145 B
3 345 C
Another approach:
def calculate_value(x):
    """Look up the 'Value' of the rows of ``intervals`` with From <= x < To.

    The selection is passed through ``squeeze()`` before being returned.
    """
    at_or_after_start = x >= intervals['From']
    before_end = x < intervals['To']
    selected = intervals.loc[at_or_after_start & before_end, 'Value']
    return selected.squeeze()


# Work on a copy so `points` itself is left untouched.
desired_result = points.copy()
desired_result['Value'] = desired_result['Point'].apply(calculate_value)
I want to add a new column to my existing dataframe.
I am doing this,
# Question's attempt: look up a MedicalPlan id for every row and attach it to df.
def test(self, sess, df):
for index, frame in df.iterrows():
# Query the id of the first plan matching the row's four identifying fields.
medical_plan = sess.query(MedicalPlan.id).filter(MedicalPlan.issuer_id == frame['issuer_id'],
MedicalPlan.hios_plan_identifier == frame['hios_plan_identifier'],
MedicalPlan.plan_year == frame['plan_year'],
MedicalPlan.group_or_individual_plan_type == frame['group_or_individual_plan_type']).first()
sess.commit()
# The id is stored on the `frame` row object, not on `df`.
frame['medical_plan_id'] = list(medical_plan)[0]
# The modified row is appended as an additional row; the row already in `df`
# is not updated.
df = df.append(frame)
print df
The df before the loop is ,
wellthie_issuer_identifier ... service_area_id
0 UHC99806 ... 1
[1 rows x 106 columns]
Normally the column and data should be added to this row. But I am getting 2 rows instead and only the last loop value inserted.
df after the loop, column is getting created but the data is wrong.
wellthie_issuer_identifier ... medical_plan_id
0 UHC99806 ... NaN
0 UHC99806 ... 879519.0
[2 rows x 107 columns]
How do I achieve this.
Output I should get as below-
wellthie_issuer_identifier ... service_area_id medical_plan_id
0 UHC99806 ... 1 879519.0
[1 rows x 107 columns]
try 1:
I called the get_id method like below -
# Question's second attempt ("try 1").
def test(self, sess, df):
print ("in test", df)
for index, frame in df.iterrows():
# `id` holds the value returned by self.get_id for this row ...
id = self.get_id(sess, frame)
# ... and that value is then passed to df.apply in the position where
# apply expects the function to call.
df['medical_plan_id'] = df.apply(id, axis=1)
print df
def test(self, sess, df):
    """Add a ``medical_plan_id`` column to ``df`` with one lookup per row.

    Args:
        sess: session object used to run the MedicalPlan queries.
        df: DataFrame with 'issuer_id', 'hios_plan_identifier', 'plan_year'
            and 'group_or_individual_plan_type' columns; modified in place.
    """
    def get_id(frame):
        """Return the id of the first plan matching ``frame``, or None."""
        medical_plan = sess.query(MedicalPlan.id).filter(
            MedicalPlan.issuer_id == frame['issuer_id'],
            MedicalPlan.hios_plan_identifier == frame['hios_plan_identifier'],
            MedicalPlan.plan_year == frame['plan_year'],
            MedicalPlan.group_or_individual_plan_type == frame['group_or_individual_plan_type'],
        ).first()
        sess.commit()
        # first() yields None when nothing matches; return None for that row
        # instead of failing on list(None).
        if medical_plan is None:
            return None
        return medical_plan[0]

    # apply(axis=1) calls get_id once per row and aligns the results with df.
    df['medical_plan_id'] = df.apply(get_id, axis=1)
    print(df)
If you want medical_plan_id to be an int, you can change the last line of get_id to return int(list(medical_plan)[0]). Also, you probably could do
# One equality criterion per attribute, looked up by name with getattr().
# Passing them as separate arguments lets filter() combine them with AND;
# Python's all() would collapse them to a single bool instead.
medical_plan = sess.query(MedicalPlan.id).filter(
    *[getattr(MedicalPlan, attribute) == frame[attribute] for attribute in
      ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']]).first()
or
attributes = ['issuer_id', 'hios_plan_identifier', 'plan_year', 'group_or_individual_plan_type']
# filter_by() takes attribute=value keyword criteria, so the shared attribute
# names can be expanded straight from the row.
medical_plan = sess.query(MedicalPlan.id).filter_by(**{name: frame[name] for name in attributes}).first()
(I can't say for certain whether that will work without knowing what kind of object MedicalPlan is.)
I have training data like below, which has all the information under a single column. The data set has more than 300000 rows.
id features label
1 name=John Matthew;age=25;1.=Post Graduate;2.=Football Player; 1
2 name=Mark clark;age=21;1.=Under Graduate;Interest=Video Games; 1
3 name=David;age=12;1:=High School;2:=Cricketer;native=america; 2
4 name=George;age=11;1:=High School;2:=Carpenter;married=yes 2
.
.
300000 name=Kevin;age=16;1:=High School;2:=Driver;Smoker=No 3
Now I need to convert this training data to the form below:
id name age 1 2 Interest married Smoker
1 John Matthew 25 Post Graduate Football Player Nan Nan Nan
2 Mark clark 21 Under Graduate Nan Video Games Nan Nan
.
.
Is there any efficient way to do this? I tried the code below, but it took 3 hours to complete.
#Getting the proper features from the features column
# Collect, per label, the five most frequent feature keys and add them to
# `train` as new (empty) columns.
cols = {}
for choices in set_label:
collection_list = []
array = train["features"][train["label"] == choices].values
# The range starts at 1, so array[0] is not inspected.
for i in range(1,len(array)):
var_split = array[i].split(";")
try :
# dict() needs exactly two items per piece; a piece that does not split
# into two on '=' raises ValueError and the whole row is skipped.
d = (dict(s.split('=') for s in var_split))
for x in d.keys():
collection_list.append(x)
except ValueError:
# The error is swallowed; this only binds the name `Error`.
Error = ValueError
count = Counter(collection_list)
for k , v in count.most_common(5):
# Normalise the key: drop ':', replace spaces with '_', lower-case.
key = k.replace(":","").replace(" ","_").lower()
cols[key] = v
columns_add = list(cols.keys())
train = train.reindex(columns = np.append( train.columns.values, columns_add))
print (train.columns)
print (train.shape)
#Adding the values for the newly created problem
# Parse every row's feature string and write each known key into its column.
for row in train.itertuples():
# NOTE: `dummy_dic` is never read in this snippet; the dict filled below is `dummy_dict`.
dummy_dic = {}
new_dict={}
value = train.loc[row.Index, 'features']
v_split = value.split(";")
try :
dummy_dict = (dict(s.split('=') for s in v_split))
for k, v in dummy_dict.items():
# Same key normalisation as used for the column names.
new_key = k.replace(":","").replace(" ","_").lower()
new_dict[new_key] = v
except ValueError:
# The error is swallowed; `new_dict` then stays empty for this row.
Error = ValueError
for k,v in new_dict.items():
if k in train.columns:
# One scalar .loc assignment per (row, key) pair.
train.loc[row.Index, k] = v
Is there any useful function that I can apply here for efficient feature extraction?
Create two DataFrames (in the first one all the features are the same for every data point and the second one is a modification of the first one introducing different features for some data points) meeting your criteria:
import pandas as pd
import numpy as np
import random
import time
import itertools
# Create a DataFrame where all the keys for each datapoint in the "features" column are the same.
num = 300000
NAMES = ['John', 'Mark', 'David', 'George', 'Kevin']
AGES = [25, 21, 12, 11, 16]
FEATURES1 = ['Post Graduate', 'Under Graduate', 'High School']
FEATURES2 = ['Football Player', 'Cricketer', 'Carpenter', 'Driver']
LABELS = [1, 2, 3]


def _random_picks(options, size):
    """Return ``size`` random picks (with replacement) from ``options``."""
    return [options[i] for i in np.random.randint(0, len(options), size=size)]


# Build both columns up front and construct the frame in one step, rather
# than enlarging an empty frame through .loc.
df = pd.DataFrame({
    "features": [
        "name={0};age={1};feature1={2};feature2={3}".format(*picks)
        for picks in zip(_random_picks(NAMES, num),
                         _random_picks(AGES, num),
                         _random_picks(FEATURES1, num),
                         _random_picks(FEATURES2, num))
    ],
    "label": _random_picks(LABELS, num),
})
print(df.head(20))
# Create a modified sample DataFrame from the previous one, where not all the keys are the same for each data point.
# Copy first: a plain `mod_df = df` would only create a second name for the
# same frame, so the edits below would also change `df`.
mod_df = df.copy()
random_positions1 = random.sample(range(10), 5)
random_positions2 = random.sample(range(11, 20), 5)
INTERESTS = ['Basketball', 'Golf', 'Rugby']
SMOKING = ['Yes', 'No']
# Five rows among the first ten get an "interest" feature instead of feature1/feature2.
mod_df.loc[random_positions1, 'features'] = [
    "name={0};age={1};interest={2}".format(
        NAMES[np.random.randint(0, len(NAMES))],
        AGES[np.random.randint(0, len(AGES))],
        INTERESTS[np.random.randint(0, len(INTERESTS))])
    for _ in range(len(random_positions1))]
# Five rows among positions 11-19 get a "smoking" feature instead.
mod_df.loc[random_positions2, 'features'] = [
    "name={0};age={1};smoking={2}".format(
        NAMES[np.random.randint(0, len(NAMES))],
        AGES[np.random.randint(0, len(AGES))],
        SMOKING[np.random.randint(0, len(SMOKING))])
    for _ in range(len(random_positions2))]
print(mod_df.head(20))
Assume that your original data is stored in a DataFrame called df.
Solution 1 (all the features are the same for every data point).
def func2(y):
    """Return the second '='-separated piece of ``y`` (the value of ``key=value``)."""
    return y.split('=')[1]


def function(x):
    """Split ``x`` on ';' and return the list of values, one per ``key=value`` item."""
    return list(map(func2, x.split(';')))
# Calculate the execution time
start = time.time()
# apply() already returns a Series, so it can be turned into a list directly.
array = df.features.apply(function).tolist()
# from_records is an alternate constructor, so it is called on the DataFrame class.
new_df = pd.DataFrame.from_records(array, columns=['name', 'age', '1', '2'])
end = time.time()
new_df
print('Total time:', end - start)
Total time: 1.80923295021
Edit: The one thing you need to do is to edit accordingly the columns list.
Solution 2 (The features might be the same or different for every data point).
import pandas as pd
import numpy as np
import time
import itertools
# The following functions are meant to extract the keys from each row, which are going to be used as columns.
def extract_key(x):
    """Return the part of ``x`` before the first '=' (all of ``x`` if there is none)."""
    key, _, _ = x.partition('=')
    return key


def def_columns(x):
    """Split ``x`` on ';' and return the key of every ``key=value`` item."""
    return list(map(extract_key, x.split(';')))
df = mod_df
# One list of keys per row.
columns = df.features.apply(def_columns).tolist()
# Flatten the per-row lists, then keep each key once, in sorted order.
flattened_columns = sorted(set(itertools.chain.from_iterable(columns)))
flattened_columns
# This function turns each row from the original dataframe into a dictionary.
def function(x):
    """Turn a ``key=value;key=value`` string into a dictionary.

    Empty or whitespace-only items (for example the one produced by a trailing
    ';') are skipped, and only the first '=' of an item separates key from
    value, so values may themselves contain '='.

    Args:
        x: the feature string of one row.

    Returns:
        dict mapping each key to its value (both strings).

    Raises:
        ValueError: if a non-empty item contains no '='.
    """
    dict_ = {}
    for item in x.split(';'):
        if not item.strip():
            continue
        key, val = item.split('=', 1)
        dict_[key] = val
    return dict_
# Parse the column once: a list with one dict per row.
arr = df.features.apply(function).tolist()
# The DataFrame constructor accepts a list of dicts directly; the columns are
# the union of all keys.
pd.DataFrame(arr)
Suppose your data is like this :
features= ["name=John Matthew;age=25;1:=Post Graduate;2:=Football Player;",
'name=Mark clark;age=21;1:=Under Graduate;2:=Football Player;',
"name=David;age=12;1:=High School;2:=Cricketer;",
"name=George;age=11;1:=High School;2:=Carpenter;",
'name=Kevin;age=16;1:=High School;2:=Driver; ']
df = pd.DataFrame({'features': features})
I will start from this answer and replace all the separators (name=, ;age=, ;1:=, ;2:=) with ;
with this function
def replace_feature(x):
    """Replace the known feature markers in ``x`` with ';' and split on ';'.

    Returns the list of pieces, including any empty strings produced by
    leading or trailing separators.
    """
    delimited = (
        x.replace("name=", ";")
        .replace(";age=", ";")
        .replace(';1:=', ';')
        .replace(';2:=', ";")
    )
    return delimited.split(';')
df = df.assign(features= df.features.apply(replace_feature))
After applying that function to your df, each value will be a list of features, where you can get each one by index.
Then I use 4 custom functions to get each attribute: name, age, grade, job.
Note: There can be a better way to do this by using only one function
def get_name(df):
    """Return the item at index 1 of ``df['features']``."""
    features = df['features']
    return features[1]


def get_age(df):
    """Return the item at index 2 of ``df['features']``."""
    features = df['features']
    return features[2]


def get_grade(df):
    """Return the item at index 3 of ``df['features']``."""
    features = df['features']
    return features[3]


def get_job(df):
    """Return the item at index 4 of ``df['features']``."""
    features = df['features']
    return features[4]
And finally, apply those functions to your dataframe:
df = df.assign(name = df.apply(get_name, axis=1),
age = df.apply(get_age, axis=1),
grade = df.apply(get_grade, axis=1),
job = df.apply(get_job, axis=1))
Hope this will be quick and fast
As far as I understand your code, the poor performance comes from the fact that you create the dataframe element by element. It's better to create the whole dataframe at once with a list of dictionaries.
Let's recreate your input dataframe :
from io import StringIO

# Columns are separated by three or more spaces so that the single spaces
# inside the feature values are not treated as separators.
data = StringIO("""id   features   label
1   name=John Matthew;age=25;1.=Post Graduate;2.=Football Player;   1
2   name=Mark clark;age=21;1.=Under Graduate;2.=Football Player;   1
3   name=David;age=12;1:=High School;2:=Cricketer;   2
4   name=George;age=11;1:=High School;2:=Carpenter;   2""")
# A regular-expression separator requires the Python parsing engine.
df = pd.read_table(data, sep=r'\s{3,}', engine='python')
we can check :
print df
id features label
0 1 name=John Matthew;age=25;1.=Post Graduate;2.=F... 1
1 2 name=Mark clark;age=21;1.=Under Graduate;2.=Fo... 1
2 3 name=David;age=12;1:=High School;2:=Cricketer; 2
3 4 name=George;age=11;1:=High School;2:=Carpenter; 2
Now we can create the needed list of dictionaries with the following code:
# One dict of key -> value per row of the 'features' column.
feat = []
for line in df['features']:
    # Normalise keys such as "1:" to "1." so both spellings share a column.
    line = line.replace(':', '.')
    # Skip empty pieces (e.g. after a trailing ';') rather than always
    # dropping the last piece, which would lose a feature when a row does not
    # end with ';'. Split on the first '=' only so values may contain '='.
    pairs = (elt.split('=', 1) for elt in line.split(';') if elt.strip())
    feat.append(dict(pairs))
And the resulting dataframe :
print pd.DataFrame(feat)
1. 2. age name
0 Post Graduate Football Player 25 John Matthew
1 Under Graduate Football Player 21 Mark clark
2 High School Cricketer 12 David
3 High School Carpenter 11 George