Modifying a dataframe column gives unexpected results - python

I have a dataframe like the one shown below:
There are actually 120,000 rows in this data and 20,000 users; the example is just one user. For every user I need to make sure the prediction has three "1"s and three "0"s.
I wrote the following function to do that:
def check_prediction_quality(df):
    df_n = df.copy()
    unique = df_n['userID'].unique()
    for i in range(len(unique)):
        ex_df = df[df['userID'] == unique[i]]
        v = ex_df['prediction'].tolist()
        v_bool = [i == 0 for i in v]
        if sum(v_bool) != 3:
            if sum(v_bool) > 3:
                res = [i for i, val in enumerate(v_bool) if val]
                diff = sum(v_bool) - 3
                for i in range(diff):
                    idx = np.random.choice(res, 1)[0]
                    v[idx] = float(1)
                    res.remove(idx)
            elif sum(v_bool) < 3:
                res = [i for i, val in enumerate(v_bool) if not val]
                diff = 3 - sum(v_bool)
                for i in range(diff):
                    idx = np.random.choice(res, 1)[0]
                    v[idx] = float(0)
                    res.remove(idx)
        for j in range(len(v)):
            df_n.loc[(0+i*6)+j:(6+i*6)+j, 'prediction'] = v[j]
    return df_n
However, when I check whether the numbers of "0"s and "1"s are the same, it turns out they are not. I am not sure what I did wrong.
sum([i == 0 for i in df['prediction']])
This should be six using the example below, but when I run it on my 120,000-row dataframe, I do not get 60,000 of each.
data = {'userID': [199810, 199810, 199810, 199810, 199810, 199810,
                   199812, 199812, 199812, 199812, 199812, 199812],
        'trackID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'prediction': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]}
df = pd.DataFrame(data=data)
df
Much appreciated!

When working with pandas DataFrames, you should assign the processed result back to the original DataFrame (or to a new variable).
df = pd.DataFrame(np.array(...))
# reassignment:
df.loc[:, 3:5] = df.loc[:, 3:5] * 10  # this multiplies columns 3 to 5 by 10
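As a sanity check after whatever fix is applied, a per-user count is easy to get with groupby. This is only a minimal sketch, assuming the userID/prediction columns from the question:

# count zeros and ones per user; every user should end up with three of each
counts = df.groupby('userID')['prediction'].value_counts().unstack(fill_value=0)
print(counts)                      # one row per user, one column per prediction value
print((counts == 3).all().all())   # True only if every user has exactly three 0s and three 1s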

Actually, never mind. I found out I don't have to modify the "0"s and "1"s.

Related

How to create dummy variables from predefined bins in python

I want to create a dataset with dummy variables from the original data based on predefined bins. I have tried using loops and splits, but it's not efficient. I'd appreciate your help.
## original data
data_dict = {"Age": [29, 35, 42, 11, 43],
             "Salary": [4380, 3280, 8790, 1200, 5420],
             "Payments": [23190, 1780, 3400, 12900, 7822]}
df = pd.DataFrame(data_dict)
df
Predefined bins:
card_dict = {"Dummy Variable":["Age:(-inf,24)","Age:(24,35)","Age:(35,49)","Age:(49,60)","Age:(60,inf)",
"Payments:(-inf,7654)","Payments:(7654,9088)","Payments:(9088,12055)","Payments:(12055,inf)",
"Salary:(-inf,2300)","Salary:(2300,3800)","Salary:(3800,5160)",
"Salary:(5160,7200)","Salary:(7200,inf)"]}
card = pd.DataFrame(card_dict)
card
My code is as follows:
# for numerical variables
def prepare_numerical_data(data, scard):
    """
    function to create dummy variables from numerical columns
    """
    # numerical columns
    num_df = df.select_dtypes(exclude='object')
    num_cols = num_df.columns.values
    variable_names = list(set([val.split(':')[0] for val in scard['Dummy Variable']]))  # to have the same columns used to create the scorecard
    num_variables = [x for x in variable_names if x in num_cols]  # select numerical variables only
    for i in num_variables:
        for j in scard['Dummy Variable']:
            if j.split(":")[0] in num_variables:
                for val in data[i].unique():
                    if (val > float(j.split(':')[1].split(',')[0][1:])) & (val <= float(j.split(':')[1].split(',')[1][:-1])):
                        data.loc[data[i] == val, j] = 1
                    else:
                        data.loc[data[i] == val, j] = 0
    return data
Here are the results:
result_df = prepare_numerical_data(df,card)
result_df
The results are not OK for the Salary and Payments columns. The function didn't create correct dummies for those two columns the way it did for Age. How can I correct that?
This worked for me. Initially my code was not looping through every column in the dataframe.
def create_dummies(data, card):
    # specify numerical and categorical columns
    num_df = data.select_dtypes(exclude='object')
    cat_df = data.select_dtypes(exclude=['float', 'int'])
    num_cols = num_df.columns.values
    cat_cols = cat_df.columns.values
    # create dummies for numerical columns
    for j in num_df.columns:
        all_value = num_df[j].values
        for variable_v in all_value:
            for i in card["Dummy Variable"].values:
                if i.split(":")[0] in num_cols:
                    var1 = i.split(":")
                    val1 = float(var1[1].strip("()").strip("[]").split(",")[0])
                    val2 = float(var1[1].strip("()").strip("[]").split(",")[1])
                    variable = var1[0]
                    if variable.lower() == j.lower():
                        if variable_v >= val1 and variable_v < val2:
                            num_df.loc[num_df[j] == variable_v, i] = 1
                        else:
                            num_df.loc[num_df[j] == variable_v, i] = 0
    return num_df
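For comparison, pandas' built-in binning can produce the same kind of dummy columns in a few lines. This is only a sketch: it assumes the bin edges can be pulled out of the card, and the edges dict below is written out by hand from the question's card (the generated column labels use pandas' interval notation rather than the card's labels):

import numpy as np
import pandas as pd

edges = {"Age": [-np.inf, 24, 35, 49, 60, np.inf],
         "Payments": [-np.inf, 7654, 9088, 12055, np.inf],
         "Salary": [-np.inf, 2300, 3800, 5160, 7200, np.inf]}

# cut each column into its bins, then one-hot encode the resulting categories
dummies = pd.concat(
    [pd.get_dummies(pd.cut(df[col], bins), prefix=col) for col, bins in edges.items()],
    axis=1)
result = pd.concat([df, dummies], axis=1)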

Remove following rows that are above or under by X amount from the current row['x']

I am calculating correlations and the data frame I have needs to be filtered.
Starting from the first row and looping through the dataframe all the way to the last row, I want to remove the rows below the current row whose values are above or below it by up to X.
example:
df['y'] has the values 50,51,52,53,54,55,70,71,72,73,74,75
If X = 10, it would start at 50 and see 51, 52, 53, 54, 55 as within that ±10 range and delete those rows. 70 would stay, as it is not within that range, and the same test would start again at 70, where 71, 72, 73, 74, 75 and their respective rows would be deleted.
The filter with X = 10 would thus leave us with the rows containing 50 and 70 in df.
This would leave me with a clean dataframe in which the rows linked to the first instance of what is essentially the same observed period have been removed. I tried coding a loop to do that, but I am left with the wrong result and am desperate at this point. Hopefully someone can correct the mistake or point me in the right direction.
df6['index'] = df6.index
df6.sort_values('index')
boom = len(dataframe1.index)/3
# Taking initial comparison values from first row
c = df6.iloc[0]['index']
# Including first row in result
filters = [True]
# Skipping first row in comparisons
for index, row in df6.iloc[1:].iterrows():
    if c - boom <= row['index'] <= c + boom:
        filters.append(False)
    else:
        filters.append(True)
        # Updating values to compare based on latest accepted row
        c = row['index']
df2 = df6.loc[filters].sort_values('correlation').drop('index', 1)
df2
OUTPUT BEFORE
OUTPUT AFTER
IIUC, your main issue is to filter consecutive values within a threshold.
You can use a custom function for that, which acts on a Series (i.e., a column) and returns the list of valid indices:
def consecutive(s, threshold=10):
    prev = float('-inf')
    idx = []
    for i, val in s.iteritems():
        if val - prev > threshold:
            idx.append(i)
            prev = val
    return idx
Example of use:
import pandas as pd
df = pd.DataFrame({'y': [50,51,52,53,54,55,70,71,72,73,74,75]})
df2 = df.loc[consecutive(df['y'])]
Output:
y
0 50
6 70
Variant
If you prefer the function to return a boolean indexer, here is a variant:
def consecutive(s, threshold=10):
    prev = float('-inf')
    idx = [False] * len(s)
    for i, val in s.iteritems():
        if val - prev > threshold:
            idx[i] = True
            prev = val
    return idx
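With the boolean variant, the call site changes slightly. One caveat of this sketch: the loop writes to idx[i] using the index label as a list position, so it only works as-is with a default RangeIndex, as in the example above.

df2 = df[consecutive(df['y'])]   # boolean mask keeps rows 0 (y=50) and 6 (y=70)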

What is the fastest way to manipulate large csv files in Python?

I have been working on Python code that reads a CSV file with 800-odd rows and around 17,000 columns.
I would like to check each entry in the CSV file, see whether the number is bigger or smaller than a value, and if it is, assign a default value. I used pandas and worked with dataframes, apply, and lambda functions. It takes 172 minutes to go through all the entries in the CSV file. Is that normal? Is there any faster way to do this? I am using Python 2.7. I don't know if it helps, but I am running it on a Windows 10 machine with 32 GB of RAM. Thanks in advance for the help.
The code is attached below.
def do_something(some_dataframe):
    col = get_req_colm(some_dataframe)
    modified_dataframe = pd.DataFrame()
    for k in col:
        temp_data = some_dataframe.apply(lambda x: check_for_range(x[k]), axis=1).tolist()
        dictionary = {}
        dictionary[str(k)] = temp_data
        temp_frame = pd.DataFrame(dictionary)
        modified_dataframe = pd.concat([modified_dataframe, temp_frame], axis=1)
    return modified_dataframe

def check_for_range(var):
    var = int(var)
    try:
        if var == 0:
            return 0
        if var == 1 or var == 4:
            return 1
        if var == 2 or var == 3 or var == 5 or var == 6:
            return 2
    except:
        print('error')

def get_req_colm(df):
    col = list(df)
    try:
        col.remove('index/Sample count')
        col.remove('index / Sample')
        col.remove('index')
        col.remove('count')
    except:
        pass
    return col

df_after_doing_something = do_something(some_dataframe)
df_after_doing_something.to_csv(output_folder + '\\df_after_doing_something.csv', index=False)
Using pandas for CSV data is efficient, but your code is not; it will be faster if you try the code given below.
def do_something(some_dataframe):
    col = get_req_colm(some_dataframe)
    np_array = some_dataframe[col].values
    result = np.zeros_like(np_array)             # values of 0 stay 0 by default
    result[np.isin(np_array, [1, 4])] = 1        # 1 and 4 -> 1
    result[np.isin(np_array, [2, 3, 5, 6])] = 2  # 2, 3, 5, 6 -> 2
    modified_dataframe = pd.DataFrame(result, columns=col)
    return modified_dataframe
def get_req_colm(df):
    col = list(df)
    try:
        col.remove('index/Sample count')
        col.remove('index / Sample')
        col.remove('index')
        col.remove('count')
    except:
        pass
    return col
This should work; just don't forget to import numpy:
import numpy as np
If this is unclear, go through a numpy tutorial first. The link below may help as well:
Replacing elements in a numpy array when there are multiple conditions
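The same multi-condition replacement can also be written with np.select, which keeps the whole mapping in one expression. A minimal sketch using the value mapping from check_for_range (the sample array here is made up for illustration):

import numpy as np

arr = np.array([0, 1, 2, 3, 4, 5, 6])
out = np.select([np.isin(arr, [1, 4]), np.isin(arr, [2, 3, 5, 6])],
                [1, 2],
                default=0)
# out -> array([0, 1, 2, 2, 1, 2, 2])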

Build table from for loop values

I have a for loop that does calculations on multiple columns of a dataframe, with multiple criteria, and prints float values that I need to arrange in a table.
demolist = ['P13+', 'P18-34']
impcount = ['<1M', '1-5M']
for imp in impcount:
    print(imp)
    for d in demolist:
        print(d)
        target_ua = df.loc[(df['target'] == d) & (df['IMP Count'] == imp), 'in_target_ua_digital'].sum()
        target_pop = df.loc[(df['target'] == d) & (df['IMP Count'] == imp), 'in_target_pop'].sum()
        target_reach = target_ua / target_pop
        print(target_reach)
The output looks like this:
<1M
P13+
0.10
P18-34
0.12
1-5M
P13+
0.92
P18-34
0.53
The code works correctly, but I need the output arranged in a new dataframe with impcount as the columns and demolist as the rows:
<1M 1-5M
P13+ 0.10 0.92
P18-34 0.12 0.53
It is just a matter of how you arrange your data. A table is a 2D data structure, often represented in Python as a list of lists, e.g. [[1, 2], [3, 4]]. For your case, you can collect the data row by row to build the table: generate a list for each row, and appending those rows to an outer list gives you the whole table as a list of lists.
Here is an example showing how to form a table when the value of each cell can be computed (random values are used here):
In [53]: x = list('abc')
    ...: y = list('123')
    ...:
    ...: data = []
    ...: for i in x:
    ...:     row = []
    ...:     for j in y:
    ...:         row.append(np.random.rand())
    ...:     data.append(row)
    ...:
    ...: df = pd.DataFrame(data, index=x, columns=y)
    ...:

In [54]: df
Out[54]:
          1         2         3
a  0.107659  0.840387  0.642285
b  0.184508  0.641443  0.475105
c  0.503608  0.379945  0.933735
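Applied to the question's data, the same row-by-row pattern looks roughly like this. It is only a sketch, assuming df carries the 'target', 'IMP Count', 'in_target_ua_digital' and 'in_target_pop' columns used in the question:

data = []
for d in demolist:
    row = []
    for imp in impcount:
        sel = (df['target'] == d) & (df['IMP Count'] == imp)
        target_ua = df.loc[sel, 'in_target_ua_digital'].sum()
        target_pop = df.loc[sel, 'in_target_pop'].sum()
        row.append(target_ua / target_pop)   # one reach value per impcount bucket
    data.append(row)
result = pd.DataFrame(data, index=demolist, columns=impcount)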
Try this:
demolist = ['P13+', 'P18-34']
impcount = ['<1M', '1-5M']
# header row: one tab-separated label per impcount bucket
imp_str = '\t'
for imp in impcount:
    imp_str += imp + '\t'
print(imp_str.rstrip())
# one printed row per demo, one value per impcount bucket
for d in demolist:
    demo_str = d + '\t'
    for imp in impcount:
        target_ua = df.loc[(df['target'] == d) & (df['IMP Count'] == imp), 'in_target_ua_digital'].sum()
        target_pop = df.loc[(df['target'] == d) & (df['IMP Count'] == imp), 'in_target_pop'].sum()
        target_reach = target_ua / target_pop
        demo_str += str(target_reach) + '\t'
    print(demo_str.rstrip())
Hope this helps!

How to efficiently join a list of values to a list of intervals?

I have a data frame which can be constructed as follows:
df = pd.DataFrame({'value': scipy.stats.norm.rvs(0, 1, size=1000),
                   'start': np.abs(scipy.stats.norm.rvs(0, 20, size=1000))})
df['end'] = df['start'] + np.abs(scipy.stats.norm.rvs(5, 5, size=1000))
df[:10]
       start     value        end
0   9.521781 -0.570097  17.708335
1   3.929711 -0.927318  15.065047
2   3.990466  0.756413   4.841934
3  20.676291 -1.418172  28.284301
4  13.084246  1.280723  14.121626
5  29.784740  0.236915  32.791751
6  21.626625  1.144663  28.739413
7  18.524309  0.101871  27.271344
8  21.288152 -0.727120  27.049582
9  13.556664  0.713141  22.136275
Each row represents a value assigned to an interval (start, end)
Now, I would like to get a list of the best values occurring at times 10, 12, 14, ..., 70. (It is similar to a geometric index in SQL, if you are familiar with that.)
Below is my first attempt in Python with pandas; it takes 18.5 ms. Can anyone help to improve it? (This procedure would be called 1M or more times with different data frames in my program.)
def get_values(data):
    data.sort_index(by='value', ascending=False, inplace=True)  # this takes 0.2ms
    # can we get rid of it? since we don't really need sort...
    # all we need is the max value for each interval.
    # But if we have to keep it for simplicity it is ok.
    ret = []
    #data = data[(data['end'] >= 10) & (data['start'] <= 71)]
    for t in range(10, 71, 2):
        interval = data[(data['end'] >= t) & (data['start'] <= t)]
        if not interval.empty:
            ret.append(interval['value'].values[0])
        else:
            for i in range(t, 71, 2):
                ret.append(None)
            break
    return ret

#%prun -l 10 print get_values(df)
%timeit get_values(df)
The second attempt decomposes the pandas operations into numpy as much as possible, and it takes around 0.7 ms:
def get_values(data):
    data.sort_index(by='value', ascending=False, inplace=True)
    ret = []
    df_end = data['end'].values
    df_start = data['start'].values
    df_value = data['value'].values
    for t in range(10, 71, 2):
        values = df_value[(df_end >= t) & (df_start <= t)]
        if len(values) != 0:
            ret.append(values[0])
        else:
            for i in range(t, 71, 2):
                ret.append(None)
            break
    return ret

#%prun -l 10 print get_values(df)
%timeit get_values(df)
Can we improve this further? I guess the next step is at the algorithm level; both of the above are just naive implementations of the logic.
I don't understand the empty-case handling in your code; here is a faster version if we ignore that empty case:
import scipy.stats as stats
import pandas as pd
import numpy as np

df = pd.DataFrame({'value': stats.norm.rvs(0, 1, size=1000),
                   'start': np.abs(stats.norm.rvs(0, 20, size=1000))})
df['end'] = df['start'] + np.abs(stats.norm.rvs(5, 5, size=1000))

def get_value(df, target):
    # sort the intervals by value, descending
    value = df["value"].values
    idx = np.argsort(value)[::-1]
    start = df["start"].values[idx]
    end = df["end"].values[idx]
    value = value[idx]
    # mask[i, j] is True if target point i falls inside interval j
    mask = (target[:, None] >= start[None, :]) & (target[:, None] <= end[None, :])
    # because the intervals are sorted by value, the first True in each row
    # is the covering interval with the largest value
    index = np.argmax(mask, axis=1)
    # flag target points that are not covered by any interval
    flags = mask[np.arange(len(target)), index]
    result = value[index]
    result[~flags] = np.nan
    return result

get_value(df, np.arange(10, 71, 2))
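For an apples-to-apples comparison with the two attempts above, the same timing harness can be reused (this is just the measurement call; no timings are claimed here):

%timeit get_value(df, np.arange(10, 71, 2))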
