How to create new columns by dividing all columns in a loop? - python

I'm having trouble with my code. I want to create new columns by dividing each column by survival_time, and then add the results to the main DataFrame as new columns named clv_mean_xxx.
here is my code.
# Source columns from which the new clv_* columns are supposed to be derived.
list_ib = ['actor_masterylevel', 'churn_yn', 'old_value2_num', 'old_value3_num','old_value4_num', 'time']
# NOTE(review): the original indentation was lost, so the nesting of the loops and of the if/else below is ambiguous.
for i in list_ib:
for j in list_ib:
if i == j:
break
else:
# NOTE(review): this rebinds `df` to the product Series (and multiplies, whereas the stated goal is to divide by survival_time), so the DataFrame is no longer reachable afterwards.
df = df[i] * df['survival_time']
df['clv_' + str(i) + '_' + str(j)] = df

If I understand the requirement, this should work
# Add one clv_mean_<column> column per entry of list_ib: that column divided by survival_time.
for column_name in list_ib:
    df['clv_mean_' + column_name] = df[column_name].div(df['survival_time'])

Related

Best Practice for Adding Lots of Columns to Pandas DataFrame

I am trying to add many columns to a pandas dataframe as follows:
def create_sum_rounds(df, col_name_base):
    """Add a ``'sum_' + col_name_base`` column to ``df`` in place.

    The new column starts at 0.0 and accumulates the columns named
    ``col_name_base + '1'`` through ``col_name_base + '5'``. A numbered column
    that is absent from ``df`` is reported through ``logger.error`` and skipped.
    """
    target = 'sum_' + col_name_base
    df[target] = 0.0
    for suffix in map(str, range(1, 6)):
        source = col_name_base + suffix
        if source not in df:
            logger.error('Col %s not in df' % source)
            continue
        df[target] += df[source]
# Build the summed column for every base name, one insert per base.
for base_name in sum_cols_list:
    create_sum_rounds(df, base_name)
Where sum_cols_list is a list of ~200 base column names (e.g. "foo"), and df is a pandas dataframe which includes the base columns extended with 1 through 5 (e.g. "foo_1", "foo_2", ..., "foo_5").
I'm getting a performance warning when I run this snippet:
PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
I believe this is because creating a new column is actually calling an insert operation behind the scenes. What's the right way to use pd.concat in this case?
You can use your same approach, but instead of operating directly on the DataFrame, you'll need to store each output as its own pd.Series. Then when all of the computations are done, use pd.concat to glue everything back to your original DataFrame.
(untested, but should work)
import pandas as pd
def create_sum_rounds(df, col_name_base):
    """Return the row-wise sum of the numbered columns for one base name.

    For example, with ``col_name_base='foo_'`` the result equals
    ``df['foo_1'] + df['foo_2'] + df['foo_3'] + df['foo_4'] + df['foo_5']``.
    ``df`` itself is not modified; the caller attaches the result (for
    instance with a single ``pd.concat``).

    Args:
        df: DataFrame holding the numbered columns.
        col_name_base: Prefix to which the digits 1 through 5 are appended.

    Returns:
        A Series named ``'sum_' + col_name_base`` and indexed like ``df``.
        Numbered columns missing from ``df`` are logged and left out of the sum.
    """
    out_name = 'sum_' + col_name_base
    # Start from 0.0 (not 0) so integer inputs still produce a float sum,
    # the same as when the output column is initialised with 0.0.
    out = pd.Series(0.0, name=out_name, index=df.index)
    for i in range(1, 6):
        col_name = col_name_base + str(i)
        if col_name in df:
            out += df[col_name]
        else:
            # Let logging do the interpolation instead of formatting eagerly.
            logger.error('Col %s not in df', col_name)
    # Set the name explicitly rather than relying on arithmetic between
    # differently named Series to keep it.
    return out.rename(out_name)
# Compute every sum first, then attach them all to df with a single concat.
col_sums = [create_sum_rounds(df, col) for col in sum_cols_list]
new_df = pd.concat([df] + col_sums, axis=1)
Additionally, you can simplify your existing code (if you're willing to forego your logging)
import pandas as pd
def create_sum_rounds(df, col_name_base):
    """Return the row-wise sum of every ``<col_name_base>_<digits>`` column.

    For example, with ``col_name_base='foo'`` the result equals
    ``df['foo_1'] + df['foo_2'] + df['foo_3'] + ...`` for however many
    numbered columns exist. ``df`` is not modified.

    Args:
        df: DataFrame holding the numbered columns.
        col_name_base: Column prefix; it is matched literally, followed by an
            underscore and one or more digits.

    Returns:
        A Series named ``'sum_' + col_name_base`` and indexed like ``df``.
    """
    import re  # local import: only needed to escape the base name

    # Anchor and escape the pattern so 'foo' cannot pick up 'xfoo_1' or
    # 'foo_1_old', and so a '.' in the base name is not a wildcard.
    pattern = rf'^{re.escape(col_name_base)}_\d+$'
    # Name the result: an unnamed Series would be labelled 0 by pd.concat,
    # giving every summed column the same label.
    return df.filter(regex=pattern).sum(axis=1).rename('sum_' + col_name_base)
# Compute every sum first, then attach them all to df with a single concat.
col_sums = [create_sum_rounds(df, col) for col in sum_cols_list]
new_df = pd.concat([df] + col_sums, axis=1)
Simplify :-)
def create_sum_rounds(df, col_name_base):
    """Store in ``df['sum_' + col_name_base]`` the row-wise sum of every
    column whose label starts with ``col_name_base``.

    The column is written into ``df`` in place; nothing is returned.
    """
    selected = [label for label in df.columns if label.startswith(col_name_base)]
    df['sum_' + col_name_base] = df.loc[:, selected].sum(axis=1)
Would this get you the results you are expecting?
# Small example frame: two numeric 'Foo_' columns plus an unrelated text column.
df = pd.DataFrame(
    {
        'Foo_1': list(range(1, 6)),
        'Foo_2': list(range(10, 60, 10)),
        'Something': list('ABCDE'),
    }
)
# Sum, row by row, every column whose label contains 'Foo_'.
df['Foo_Sum'] = df.filter(like='Foo_').sum(axis='columns')

Pandas add a new column with a string where the cell match a particular condition

I'm trying to apply Pandas style to my dataset and add a column with a string with the matching result.
This is what I want to achieve:
Link
Below is my code. An expert from Stack Overflow helped me apply df.style, so based on my tests I believe the df.style part is correct. However, how can I run iterrows(), check the cell in each column, and return/store a string in the new column 'check'? I have been trying to debug this but cannot get it to display what I want. Thank you so much.
# Sample frame; 'check' is a placeholder column for the per-row messages.
df = pd.DataFrame([[10,3,1], [3,7,2], [2,4,4]], columns=list("ABC"))
df['check'] = None
# Styler callback: returns a frame of CSS strings carrying the yellow style wherever the mask m is True.
def highlight(x):
c1 = 'background-color: yellow'
# Boolean mask, one column per rule: A > 6, B > 2, C < 3.
m = pd.concat([(x['A'] > 6), (x['B'] > 2), (x['C'] < 3)], axis=1)
df1 = pd.DataFrame('', index=x.index, columns=x.columns)
return df1.mask(m, c1)
# Intended to build the message stored in 'check' for one row.
# NOTE(review): A, B and C below are bare names rather than the strings 'A', 'B', 'C'; nothing in this snippet defines them, which matches the NameError reported for this code.
# NOTE(review): the original indentation was lost, so where each `return` sits relative to its loop cannot be told from here.
def check(v):
for index, row in v[[A]].iterrows():
if row[A] > 6:
A_check = f'row:{index},' + '{0:.1f}'.format(row[A]) + ">6"
return A_check
for index, row in v[[B]].iterrows():
if row[B] > 2:
B_check = f'row:{index}' + '{0:.1f}'.format(row[B]) + ">2"
return B_check
for index, row in v[[C]].iterrows():
if row[C] < 3:
C_check = f'row:{index}' + '{0:.1f}'.format(row[C]) + "<3"
return C_check
# axis=1 hands check one row of df at a time.
df['check'] = df.apply(lambda v: check(v), axis=1)
# axis=None passes the whole frame to highlight in one call.
df.style.apply(highlight, axis=None)
This is the error message I got:
NameError: name 'A' is not defined
My understanding is that the following produces what you are trying to achieve with the check function:
def check(v):
    """Describe which thresholds one row violates.

    ``v`` is a single row (a Series with entries 'A', 'B' and 'C'); ``v.name``
    is used as the row label. One line is produced for each rule that fires
    (A > 6, B > 2, C < 3), in that order, and the lines are joined with
    newlines. The result is an empty string when no rule fires.
    """
    prefix = f'row:{v.name}, '
    rules = (
        ('A', v['A'] > 6, '>6'),
        ('B', v['B'] > 2, '>2'),
        ('C', v['C'] < 3, '<3'),
    )
    messages = [
        f'{prefix}{v[column]:.1f}{label}'
        for column, fired, label in rules
        if fired
    ]
    return '\n'.join(messages)
# Run check once per row and store the resulting message in the 'check' column.
df['check'] = df.apply(check, axis='columns')
Result (print(df)):
A B C check
0 10 3 1 row:0, 10.0>6\nrow:0, 3.0>2\nrow:0, 1.0<3
1 3 7 2 row:1, 7.0>2\nrow:1, 2.0<3
2 2 4 4 row:2, 4.0>2
(Replace \n with ' ' if you don't want the line breaks in the result.)
The axis=1 option in apply gives the function check one row of df as a Series with the column names of df as index (-> v). With v.name you'll get the corresponding row index. Therefore I don't see the need to use .iter.... Did I miss something?
There are few mistakes in program which we will fix one by one
Import pandas
import pandas as pd
In function check(v), the variables A, B, C are not defined; replace them with 'A', 'B', 'C'. Then v[['A']] is a Series, and to iterate over a Series we use iteritems() rather than iterrows() (iteritems() was removed in pandas 2.0; use items() there). Note also that the index yielded is the column name, because v is a single row. Replacing these gives
def check(v):
    """Describe which thresholds one row violates.

    Meant to be called through ``df.apply(check, axis=1)``, where ``v`` is one
    row as a Series and ``v.name`` is that row's index label.

    Args:
        v: Series with entries 'A', 'B' and 'C'.

    Returns:
        One ``row:<label>,<value><rule>`` line per rule that fires (A > 6,
        B > 2, C < 3), joined with newlines; an empty string when none fires.
    """
    # Label messages with the row index. Iterating v[['A']] would yield the
    # column name instead, and Series.iteritems() no longer exists in
    # pandas 2.x.
    row_label = v.name
    truth = []
    if v['A'] > 6:
        truth.append(f'row:{row_label},' + '{0:.1f}'.format(v['A']) + ">6")
    if v['B'] > 2:
        # Same 'row:<label>,' prefix as the A rule, so the label and the
        # value are not fused together.
        truth.append(f'row:{row_label},' + '{0:.1f}'.format(v['B']) + ">2")
    if v['C'] < 3:
        truth.append(f'row:{row_label},' + '{0:.1f}'.format(v['C']) + "<3")
    return '\n'.join(truth)
This should give the expected output, although you also need to add logic so that the check column doesn't get the yellow color. This answer makes minimal changes, but I recommend trying axis=1 to apply the style column-wise, as it seems more convenient. You can also refer to the pandas style guide.

Selecting columns using [[]] is very inefficient especially as the size of the dataset increases in python using pandas

Created sample data using below function:
def create_sample(num_of_rows=1000):
    """Build a two-column frame of random floats drawn with random.uniform(0.0, 1.0).

    The columns are 'var1' and 'other', each with ``num_of_rows`` values. The
    frame's shape and dtypes are printed before it is returned.
    """
    columns = {}
    for name in ('var1', 'other'):
        columns[name] = [random.uniform(0.0, 1.0) for _ in range(num_of_rows)]
    frame = pd.DataFrame(columns)
    print(f"Shape : {frame.shape}")
    print(f"Type : \n{frame.dtypes}")
    return frame
# Benchmark: time each iteration that adds one column and then selects two columns with df[[...]].
df = create_sample()
times = []  # one time.time() difference (seconds) per iteration
for i in range(1, 300):
start = time.time()
# Make the dataframe 1 column bigger
# NOTE(review): this insert sits inside the timed region, so each sample covers the insert plus the selection.
df['var' + str(i + 1)] = df['var' + str(i)]
# Select two columns from the dataframe using double square brackets
####################################################
temp = df[['var' + str(i + 1), 'var' + str(i)]]
####################################################
end = time.time()
times.append(end - start)
# NOTE(review): the next assignment has no effect; start is reassigned at the top of the next iteration.
start = end
plt.plot(times)
print(sum(times))
The graph is linear
enter image description here
When pd.concat is used to select the columns instead, the graph shows peaks at every 100th iteration. Why is this so?
# Benchmark: time each iteration that adds one column and then selects two columns with pd.concat.
df = create_sample()
times = []  # one time.time() difference (seconds) per iteration
for i in range(1, 300):
start = time.time()
# Make the dataframe 1 column bigger
# NOTE(review): this insert sits inside the timed region, so each sample covers the insert plus the selection.
df['var' + str(i + 1)] = df['var' + str(i)]
# Select the same two columns, this time by concatenating the two Series with pd.concat
####################################################
temp = pd.concat([df['var' + str(i + 1)],df['var' + str(i)]], axis=1)
####################################################
end = time.time()
times.append(end - start)
# NOTE(review): the next assignment has no effect; start is reassigned at the top of the next iteration.
start = end
plt.plot(times)
print(sum(times))
please ignore indentation.
**From the above we can see that the time taken to select columns using [[]] increases linearly with the size of the dataset.
However, using pd.concat the time does not increase materially. Why does it spike only at every 100th iteration? The cause is not obvious.**

Is there a way to optimize this code in order to run faster?

Hi there, I am working on an application and I am using this piece of code to create new columns in a data frame so that I can make some calculations. However, it is really slow and I would like to try a new approach.
I have read about Multiprocessing, but I am not sure how and where to use it, so I am asking for your help.
def create_exposed_columns(df, months=None):
    """Add month-period columns and one indicator column per exposed month.

    ``df`` is modified in place to gain 'MONTH_INITIAL_DATE',
    'MONTH_FINAL_DATE' (monthly periods of the two date columns) and 'Diff'
    (their difference). For every row, the months from INITIAL_DATE's month
    through ``months`` months later are then flagged with 1 in columns named
    'YYYY-MM'; months outside a row's window are left missing.

    Args:
        df: DataFrame with datetime columns 'INITIAL_DATE' and 'FINAL_DATE'.
        months: Number of months to flag after the initial month. When None
            (the default) the module-level ``meses_iterables`` is used.

    Returns:
        A new DataFrame: ``df`` after ``reset_index()`` (so the old index
        becomes a column) with the 'YYYY-MM' columns appended.
    """
    df['MONTH_INITIAL_DATE'] = df['INITIAL_DATE'].dt.to_period('M')
    df['MONTH_FINAL_DATE'] = df['FINAL_DATE'].dt.to_period('M')
    df['Diff'] = df['MONTH_FINAL_DATE'] - df['MONTH_INITIAL_DATE']

    span = meses_iterables if months is None else months

    exposures = []
    # Only INITIAL_DATE is needed per row, so iterate that column directly
    # instead of materialising every row with iterrows().
    for initial_date in df['INITIAL_DATE']:
        if pd.isna(initial_date):
            # No start date: nothing to flag for this row.
            exposures.append({})
            continue
        # Count months from year 0 so adding an offset rolls over years.
        first_month = initial_date.year * 12 + initial_date.month - 1
        row_flags = {}
        for offset in range(span + 1):
            year, month_index = divmod(first_month + offset, 12)
            row_flags[f'{year}-{month_index + 1:02d}'] = 1
        exposures.append(row_flags)

    df_2 = pd.DataFrame(exposures)
    # reset_index gives both frames the same 0..n-1 index so concat lines the
    # rows up positionally.
    df = df.reset_index()
    return pd.concat([df, df_2], axis=1)
I have no idea where to start, so any kind of help will be useful.
Thanks

How to create a new pandas column by scanning cross multiple columns using for-loops?

I have 25 variables DXCODE1 to DXCODE25, which I want to scan across to see if any of these values for each row matches the icd_list. For example, in each row, I want to scan across from DXCODE1 to DXCODE25 and see if any of these contains any one of the following three values: 'F32', 'F33', 'F34', if it does, then I want to return 1. I tried the following:
# Row-wise check: returns 1 as soon as any DXCODE1..DXCODE25 cell contains one of the codes in icd_list.
# NOTE(review): there is no return after the loops, so a row without a match yields None rather than 0.
def scan_icd (row):
icd_list = ['F32', 'F33', 'F34']
for i in range(1, 26):
dx_code_loc = 'DXCODE' + str(i)
for j in range(0, len(icd_list)):
# NOTE(review): `in` needs a string (or other container) on the right; a float cell would raise "argument of type 'float' is not iterable" — presumably missing codes are stored as NaN, TODO confirm.
if icd_list[j] in row[dx_code_loc]:
return 1
# axis=1 calls scan_icd once per row.
df['ICD_DX'] = df.apply(scan_icd, axis=1)
But I got this error:
TypeError: ("argument of type 'float' is not iterable", 'occurred at index 1')
Also I would like to make it flexible so I can somehow specify the icd code as a list in the parameter. But I don't know how to apply syntax-wise:
# Variant that takes the codes to look for as a parameter instead of hard-coding them.
def scan_icd (row, icd_list):
# NOTE(review): self-assignment, has no effect.
icd_list = icd_list
for i in range(1, 26):
dx_code_loc = 'DXCODE' + str(i)
for j in range(0, len(icd_list)):
if icd_list[j] in row[dx_code_loc]:
return 1
# NOTE(review): this calls scan_icd immediately, with the list bound to `row`, instead of handing the function itself to apply.
df['ICD_DX'] = df.apply(scan_icd (['F32', 'F33', 'F34']), axis=1)
TypeError: apply() got multiple values for argument 'axis'
===================
Edit:
The columns are labeled DXCODE1,DXCODE2, ... DXCODE25
I think this apply will do the job you want
icd_list = ['F32', 'F33', 'F34']
# Flag a row with 1 when any of its cells equals one of the listed codes, otherwise 0.
df['ICD_DX'] = df.apply(lambda row: int(row.isin(icd_list).any()), axis='columns')
You check if any element of icd_list is in your row
EDIT: if you want to keep your for-loops (sorry I didn't see this requirement at first) I would do:
def scan_icd(row, icd_list):
    """Return 1 if any DXCODE1..DXCODE25 value contains a code from icd_list, else 0.

    Args:
        row: Mapping-like row (for example a Series passed by
            ``df.apply(..., axis=1)``) with keys 'DXCODE1' through 'DXCODE25'.
        icd_list: Codes to look for; each is matched as a substring of the
            cell value.

    Returns:
        1 on the first match, 0 when no cell matches.
    """
    for i in range(1, 26):
        value = row['DXCODE' + str(i)]
        # Only strings are searched: `code in value` raises TypeError on a
        # float cell (e.g. NaN for a missing diagnosis).
        if isinstance(value, str) and any(code in value for code in icd_list):
            return 1
    return 0  # return 0 if none match
icd_list = ['F32', 'F33', 'F34']
# note: args carries the extra positional arguments for scan_icd, so icd_list is wrapped in a one-element tuple
df['ICD_DX'] = df.apply(scan_icd, axis=1, args=(icd_list,))
EDIT 2: to specify the columns, you can do:
list_col = [f'DXCODE{i}' for i in range(1, 26)]
df['ICD_DX'] = df.apply(lambda row: int(row[list_col].isin(icd_list).any()), axis='columns')
# the difference from the first version is row[list_col]: only the DXCODE columns are checked

Categories