# Sample data: each row holds a single string of "col, x, value, y"
# groups separated by semicolons.
# Renamed from `list` (which shadows the builtin) to `l`, the name the
# answer below actually iterates; the long literal in the last row was
# also rejoined onto one line so the snippet is valid Python.
l = [
    ['10, 0.01, 0.0428, 120; 30, 0.1, 2, 33; 50, 0.023, 0.31, 0.65'],
    ['10, 0.7, 0.5428, 2.31'],
    ['50, 0.3, 0.35, 0.1'],
    ['-10, 0.2, 0.048, 124; -30, 0.11, 24, 3; -50, 0.02, 0.1, 0.60; 0, 0, 0, 0; 10, 0.1, 2, 33; 20, 0.023, 0.31, 0.66'],
]
df = pd.DataFrame(l)
I have a dataframe df from which I am trying to get the 3rd value after each semicolon sign if the column name matches with the 1st value after the semicolon sign. The expected output is as below. Any clue on how to tackle this in a simple way?
Use nested loops:
# Collect {(row, column): value} pairs, then pivot into a wide frame
# with missing cells filled with 0.
pairs = {}
for row_idx, record in enumerate(l):
    for group in record[0].split(';'):
        fields = group.split(',')
        # fields[0] is the column label, fields[2] the value to keep.
        pairs[(row_idx, int(fields[0]))] = float(fields[2])
df = pd.Series(pairs).unstack(fill_value=0)
Output:
>>> df
-50 -30 -10 0 10 20 30 50
0 0.0 0.0 0.000 0.0 0.0428 0.00 2.0 0.31
1 0.0 0.0 0.000 0.0 0.5428 0.00 0.0 0.00
2 0.0 0.0 0.000 0.0 0.0000 0.00 0.0 0.35
3 0.1 24.0 0.048 0.0 2.0000 0.31 0.0 0.00
Related
I have the following pandas dataframe:
# Build the example frame from a plain dict of columns.
example_columns = {
    'pred': [1, 2, 3, 4],
    'a': [0.4, 0.6, 0.35, 0.5],
    'b': [0.2, 0.4, 0.32, 0.1],
    'c': [0.1, 0, 0.2, 0.2],
    'd': [0.3, 0, 0.1, 0.2],
}
df = pd.DataFrame(example_columns)
I want to change values on 'pred' column, based on columns a,b,c,d , as following:
if the value in column a is larger than the values in columns b, c and d
and
if one of columns - b , c or d has value larger than 0.25
then change value in 'pred' to 0. so the results should be:
pred a b c d
0 1 0.4 0.2 0.1 0.1
1 0 0.6 0.4 0.0 0.0
2 0 0.35 0.32 0.2 0.3
3 4 0.5 0.1 0.2 0.2
How can I do this?
Create a boolean condition/mask then use loc to set value to 0 where condition is True
others = ['b', 'c', 'd']
# Row qualifies when every one of b/c/d is below a AND at least one of
# them exceeds 0.25; those rows get pred reset to 0.
a_dominates = df[others].lt(df['a'], axis=0).all(axis=1)
over_threshold = df[others].gt(0.25).any(axis=1)
df.loc[a_dominates & over_threshold, 'pred'] = 0
pred a b c d
0 1 0.40 0.20 0.1 0.1
1 0 0.60 0.40 0.0 0.0
2 0 0.35 0.32 0.2 0.3
3 4 0.50 0.10 0.2 0.2
import pandas as pd


def row_cond(row):
    """Zero out 'pred' when 'a' beats b/c/d and the runner-up tops 0.25.

    row is a Series ordered (pred, a, b, c, d); the (possibly modified)
    row is returned so the function works with DataFrame.apply(axis=1).
    """
    # Use .iloc for positional access: plain row[1] / row[2:] relied on
    # integer-fallback indexing on a labeled Series, which newer pandas
    # versions deprecate/reject.
    m_val = max(row.iloc[2:])
    if row.iloc[1] > m_val and m_val > 0.25:
        row.iloc[0] = 0
    return row


df = pd.DataFrame({'pred': [1, 2, 3, 4],
                   'a': [0.4, 0.6, 0.35, 0.5],
                   'b': [0.2, 0.4, 0.32, 0.1],
                   'c': [0.1, 0, 0.2, 0.2],
                   'd': [0.1, 0, 0.3, 0.2]})
new_df = df.apply(row_cond, axis=1)
Output:
pred a b c d
0 1.0 0.40 0.20 0.1 0.1
1 0.0 0.60 0.40 0.0 0.0
2 0.0 0.35 0.32 0.2 0.3
3 4.0 0.50 0.10 0.2 0.2
I have the following DataFrame:
# Example frame for the "sum the row-max values per column" question.
df = pd.DataFrame(
    {
        'a': [0.28, 0, 0.25, 0.85, 0.1],
        'b': [0.5, 0.5, 0, 0.75, 0.1],
        'c': [0.33, 0.7, 0.25, 0.2, 0.5],
        'd': [0, 0.25, 0.2, 0.66, 0.1],
    }
)
Output:
a b c d
0 0.28 0.50 0.33 0.00
1 0.00 0.50 0.70 0.25
2 0.25 0.00 0.25 0.20
3 0.85 0.75 0.20 0.66
4 0.10 0.10 0.50 0.10
For each column, I want to sum the top n max values of the column, where n is determined by how many row max values that column contains.
For example, column b has a row max only in row 1, so its sum is the sum of the top 1 max values in that column, which is just 0.5 -- but column c has three row-maxes, located in rows 1, 2, and 4, so the top 3 max values of column c should be summed.
Expected output:
a b c d
0 0.28 0.50 0.33 0.00
1 0.00 0.50 0.70 0.25
2 0.25 0.00 0.25 0.20
3 0.85 0.75 0.20 0.66
4 0.10 0.10 0.50 0.10
count 1.10 0.50 1.45 0.00
where
# Keep only the values that equal their row's maximum (ties kept),
# sum each column, and attach the result as a 'count' row.
# DataFrame.append was removed in pandas 2.0, so the extra row is
# added with pd.concat instead.
row_max_only = df.where(df.eq(df.max(axis=1), axis=0))
pd.concat([df, row_max_only.sum().rename('count').to_frame().T])
a b c d
0 0.28 0.50 0.33 0.00
1 0.00 0.50 0.70 0.25
2 0.25 0.00 0.25 0.20
3 0.85 0.75 0.20 0.66
4 0.10 0.10 0.50 0.10
count 1.10 0.50 1.45 0.00
The fastest way to do this would be to use the .max() method, passing the axis argument:
# Row-wise maximum (axis=1 scans across columns for each row).
# PEP 8: no space around '=' in keyword arguments.
df.max(axis=1)
If you're after another column:
# Store the row-wise maximum in a new column (PEP 8 keyword spacing).
df['column_name'] = df.max(axis=1)
I didn't read the question that well!
I try to calculate cumulative returns over for example 3 days and assign the end value to a new DataFrame cumulative_return. To illustrate that I created this example:
returns:
01K 02K 03K 04K
Dates
2022-01-01 1.02 1.05 1.02 1.01
2022-01-02 0.97 0.99 1.02 1.06
2022-01-03 1.03 1.07 0.98 1.02
2022-01-04 0.96 1.02 1.03 0.98
2022-01-05 1.02 1.02 1.09 1.03
2022-01-06 1.06 0.95 0.96 0.99
start:
01K 02K 03K 04K
Dates
2022-01-01 1 0 0 0
2022-01-02 0 1 0 0
2022-01-03 0 0 0 0
2022-01-04 1 0 1 0
2022-01-05 0 0 1 0
2022-01-06 0 0 0 1
cumulative_returns:
01K 02K 03K 04K
Dates
2022-01-01 1.019 0.00 0.000 0.0
2022-01-02 0.000 1.08 0.000 0.0
2022-01-03 0.000 0.00 0.000 0.0
2022-01-04 1.038 0.00 1.078 0.0
2022-01-05 0.000 0.00 NaN 0.0
2022-01-06 0.000 0.00 0.000 NaN
An example for the cumulative_return calculation over 3 days:
2022-01-01/01K: 1.019 = 1.02 * 0.97 * 1.03
2022-01-04/01K: 1.038 = 0.96 * 1.02 * 1.06
So far I was only able to calculate the cumulative return by individually shifting the returns, which is not efficient for longer time intervals.
returns.shift(-2) * returns.shift(-1) * returns * start
For reproducibility:
import pandas as pd
import numpy as np

dates = ['2022-01-01', '2022-01-02', '2022-01-03',
         '2022-01-04', '2022-01-05', '2022-01-06']

# Daily returns; adding 1 turns them into growth factors.
returns = pd.DataFrame({
    'Dates': dates,
    '01K': [0.02, -0.03, 0.03, -0.04, 0.02, 0.06],
    '02K': [0.05, -0.01, 0.07, 0.02, 0.02, -0.05],
    '03K': [0.02, 0.02, -0.02, 0.03, 0.09, -0.04],
    '04K': [0.01, 0.06, 0.02, -0.02, 0.03, -0.01],
}).set_index('Dates') + 1

# 1 marks the dates where a cumulative return window starts.
start = pd.DataFrame({
    'Dates': dates,
    '01K': [1, 0, 0, 1, 0, 0],
    '02K': [0, 1, 0, 0, 0, 0],
    '03K': [0, 0, 0, 1, 1, 0],
    '04K': [0, 0, 0, 0, 0, 1],
}).set_index('Dates')
You can leverage the pandas rolling function together with NumPy's product function to get the following:
n = 3  # window length in days
# raw=True hands plain ndarrays to np.prod (avoids building a Series
# per window); shift(-n+1) aligns each window's product with the date
# the window starts on. Values are unchanged.
returns.rolling(n).apply(np.prod, raw=True).shift(-n + 1)
output:
01K 02K 03K 04K
Dates
2022-01-01 1.019082 1.112265 1.019592 1.092012
2022-01-02 0.959136 1.080486 1.029588 1.059576
2022-01-03 1.008576 1.113228 1.100246 1.029588
2022-01-04 1.037952 0.988380 1.077792 0.999306
2022-01-05 NaN NaN NaN NaN
2022-01-06 NaN NaN NaN NaN
I have two DataFrames; returns and weights and I try to combine them to a floating_weights DataFrame. The idea behind this is that I want to dynamically adjust the weights in period t with the return in period t-1. So the weights increase if there are positive returns and vice versa.
Here I created a simple example:
weights:
Dates 01K W 02K W 03K W 04K W
0 2021-01-01 0.0 0.2 0.3 0.5
1 2021-01-02 0.0 0.2 0.3 0.5
2 2021-01-03 0.5 0.2 0.3 0.0
3 2021-01-04 0.5 0.2 0.3 0.0
4 2021-01-05 0.5 0.0 0.2 0.3
5 2021-01-06 0.5 0.0 0.2 0.3
returns:
Dates 01K W 02K W 03K W 04K W
0 2021-01-01 0.01 0.01 -0.03 0.05
1 2021-01-02 -0.02 0.02 0.04 -0.02
2 2021-01-03 0.03 -0.03 0.01 -0.02
3 2021-01-04 -0.03 0.01 0.02 0.01
4 2021-01-05 0.02 0.02 0.01 0.01
5 2021-01-06 0.01 -0.01 0.03 0.02
The floating_weights DataFrame is based on the normal weights adjusted with the returns from the previous period:
floating_weights (2021-01-01, 02K W): 0.2 (start with normal weight)
floating_weights (2021-01-02, 02K W): 0.202 = 0.2 * (1+0.01)
floating_weights (2021-01-03, 02K W): 0.206 = 0.2 * (1+0.01) * (1+0.02)
floating_weights (2021-01-04, 02K W): 0.19986 = 0.2 * (1+0.01) * (1+0.02) * (1-0.03)
The floating_weights would look like this.
Dates 01K W 02K W 03K W 04K W
0 2021-01-01 0.0000 0.20000 0.30000 0.500
1 2021-01-02 0.0000 0.20200 0.29100 0.525
2 2021-01-03 0.5000 0.20604 0.30264 0.000
3 2021-01-04 0.5150 0.19986 0.30567 0.000
4 2021-01-05 0.4995 0.00000 0.20785 0.300
5 2021-01-06 0.5095 0.00000 0.20993 0.303
For reproducibility:
import pandas as pd

date_index = ['2021-01-01', '2021-01-02', '2021-01-03',
              '2021-01-04', '2021-01-05', '2021-01-06']

# Period returns per asset.
returns = pd.DataFrame({
    'Dates': date_index,
    '01K W': [0.01, -0.2, 0.03, -0.03, 0.02, 0.01],
    '02K W': [0.01, 0.02, -0.03, 0.01, 0.02, -0.01],
    '03K W': [-0.03, 0.04, 0.01, 0.02, 0.01, 0.03],
    '04K W': [0.05, -0.02, -0.02, 0.01, 0.01, 0.02],
}).set_index('Dates')

# Target portfolio weights per period.
weights = pd.DataFrame({
    'Dates': date_index,
    '01K W': [0, 0, 0.5, 0.5, 0.5, 0.5],
    '02K W': [0.2, 0.2, 0.2, 0.2, 0, 0],
    '03K W': [0.3, 0.3, 0.3, 0.3, 0.2, 0.2],
    '04K W': [0.5, 0.5, 0, 0, 0.3, 0.3],
}).set_index('Dates')
Thank you very much for the help!
We can use cumprod to calculate the cumulative returns, then shift and multiply the cumulative returns with the weights dataframe to get the desired result
# Cumulative growth factor up to (but excluding) each date; multiplying
# it into the weights gives the floating weights. fill_value=1 leaves
# the first row (no prior return) at its original weight.
growth = returns.add(1).cumprod().shift()
floating_weights = weights.mul(growth, fill_value=1)
If you want to reset the cumprod every time the weight is assigned to zero, in such case we have to consider each column separately
# Restart the compounding whenever a weight drops to zero: every zero
# starts a new group, and cumprod runs within each group only.
floating_weights = weights.copy()
for name in weights.columns:
    regime = weights[name].eq(0).cumsum()
    growth = returns[name].add(1).groupby(regime).cumprod()
    floating_weights[name] = weights[name].mul(growth.shift(1), fill_value=1)
>>> floating_weights
01K W 02K W 03K W 04K W
Dates
2021-01-01 0.000000 0.200000 0.300000 0.500
2021-01-02 0.000000 0.202000 0.291000 0.525
2021-01-03 0.500000 0.206040 0.302640 0.000
2021-01-04 0.515000 0.199859 0.305666 0.000
2021-01-05 0.499550 0.000000 0.207853 0.300
2021-01-06 0.509541 0.000000 0.209932 0.303
I have some DataFrame:
# Example frame for the "drop rows with two or more zeros" question.
fruit_data = {
    'name': ['apple1', 'apple2', 'apple3', 'apple4',
             'orange1', 'orange2', 'orange3', 'orange4'],
    'A': [0, 0, 0, 0, 0, 0, 0, 0],
    'B': [0.10, -0.15, 0.25, -0.55, 0.50, -0.51, 0.70, 0],
    'C': [0, 0, 0.25, -0.55, 0.50, -0.51, 0.70, 0.90],
    'D': [0.10, -0.15, 0.25, 0, 0.50, -0.51, 0.70, 0.90],
}
df = pd.DataFrame(fruit_data)
df
name A B C D
0 apple1 0 0.10 0.00 0.10
1 apple2 0 -0.15 0.00 -0.15
2 apple3 0 0.25 0.25 0.25
3 apple4 0 -0.55 -0.55 0.00
4 orange1 0 0.50 0.50 0.50
5 orange2 0 -0.51 -0.51 -0.51
6 orange3 0 0.70 0.70 0.70
7 orange4 0 0.00 0.90 0.90
I'd like to drop all rows that have two or more zeros in columns A,B,C,D.
This DataFrame has other columns that have zeros; I only want to check for zeros in columns A,B,C,D.
You can use .eq to check if dataframe is equal to 0 and then take sum on axis=1 and return a boolean series by checking if the sum is greater than or equal to 2 (ge):
# Keep rows with fewer than two zeros across columns A-D
# (sum < 2 is the same condition as negating sum >= 2).
df[df[['A', 'B', 'C', 'D']].eq(0).sum(axis=1).lt(2)]
name A B C D
2 apple3 0 0.25 0.25 0.25
4 orange1 0 0.50 0.50 0.50
5 orange2 0 -0.51 -0.51 -0.51
6 orange3 0 0.70 0.70 0.70