“For Loop” user-defined function - Python

I am trying to create a user-defined function with one input (“Port”) that should give me a list of 371 floats; however, I am currently getting only one element in the list. I would really appreciate any help. Thanks.
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')

def function(port):
    for year in range(1990, 2021, 1):
        for month in range(1, 13, 1):
            if year == 1990 and month == 1:
                pass
            else:
                y = []
                a = df
                a = a.set_index('date')
                a = a[a.index.year == year]
                a = a[a.index.month == month - 1]
                a.loc['Volatility'] = a.std()
                a = a.T
                a['Portfolio'] = pd.qcut(a['Volatility'], q=[0, .2, .4, .6, .8, 1],
                                         labels=['1', '2', '3', '4', '5']).astype(int)
                port = a[a['Portfolio'] == port]
                port.sort_values("Volatility")
                b = port.index
                #############################################################
                c = df
                c = c.set_index('date')
                c = c[c.index.year == year]
                c = c[c.index.month == month]
                c = c[b]
                c = c.T
                c += 1
                c["returns"] = (c.product(axis=1) - 1)
                C_r = c['returns'].sum()
                y.append(C_r)
    return y

p = function(1)

This happens because you initialize y as an empty list inside the loop, so it is reset to [] on every iteration. To solve the problem, just initialize the empty list once, before the loop:
y = []
for year in range(1990, 2021, 1):
    for month in range(1, 13, 1):
        if year == 1990 and month == 1:
            pass
        else:
            .....
            y.append(...)
You can also simplify the conditional statement by writing
if not (year == 1990 and month == 1):
so you can get rid of the else and pass, since they aren't necessary.
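Putting both suggestions together, a sketch of the corrected function (untested, keeping the rest of your computation as-is) could look like this:

def function(port):
    y = []                                   # initialize once, before the loops
    for year in range(1990, 2021):
        for month in range(1, 13):
            if not (year == 1990 and month == 1):
                a = df.set_index('date')
                a = a[a.index.year == year]
                a = a[a.index.month == month - 1]
                a.loc['Volatility'] = a.std()
                a = a.T
                a['Portfolio'] = pd.qcut(a['Volatility'], q=[0, .2, .4, .6, .8, 1],
                                         labels=['1', '2', '3', '4', '5']).astype(int)
                selected = a[a['Portfolio'] == port]   # avoid overwriting the 'port' argument
                b = selected.index
                c = df.set_index('date')
                c = c[(c.index.year == year) & (c.index.month == month)]
                c = c[b]
                c = c.T
                c += 1
                c['returns'] = c.product(axis=1) - 1
                y.append(c['returns'].sum())
    return y                                 # one value per (year, month): 371 in total

p = function(1)

The only functional changes are moving y = [] out of the loops, collapsing the if/else into if not (...), and not reusing the name port for the filtered frame.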

Related

apply my function if...else statement with condition doesn't pass

My input (as an example):
df = pd.DataFrame({'frame': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                   'sum_result_ICV': [0, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 0],
                   'sum_result_AO': [0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0]})
dd['result_ICV'] = 0
dd['result_ATO'] = 0
My code and my_func:
for z in range(0, len(cv_details)):
    def result_func(row):
        for i in range(0, len(dd)):
            if row == 2:
                return (cv_details[z])
            elif row == 1:
                if dd.loc[dd['sum_result_' + cv_details[z]] == 2, 'frame'].empty:
                    return ('ReviewNG-' + cv_details[z])
                elif (dd['frame'][i] - dd.loc[dd.iloc[:, z+1] == 2, 'frame'].iloc[0]) <= 3:
                    return ('NG-' + cv_details[z])
                elif (dd['frame'][i] - dd.loc[dd.iloc[:, z+1] == 2, 'frame'].iloc[-1]) <= 3:
                    return ('NG-' + cv_details[z])
                else:
                    return ('ReviewNG-' + cv_details[z])
            elif row == 0:
                return ('Other')
            else:
                return ""
    dd.iloc[:, z+3] = dd.iloc[:, z+1].apply(result_func)
I expect the result shown in the attached picture, but my output is different.
So, as you can see, I need a condition like this: if sum_result_ICV equals 0, put "Other"; if sum_result_ICV equals 1 AND the difference between the frame number and the first/last frame where sum_result_ICV equals 2 is less than or equal to 3, put "NG-ICV"; otherwise put "ReviewNG-ICV". For example, frame 11 has sum_result_ICV equal to 1, and the nearest frame where sum_result_ICV equals 2 is frame 7, so 11 - 7 > 3 and it should get "ReviewNG-ICV". In my example, frames 1 to 3 must be "NG-ICV", and also frames 8 to 10, but frames 11 to 14 must be "ReviewNG-ICV". Also, please see the picture of what I expect from my function. So what am I doing wrong?
UPDATE based on the answer from @woblob
The new code with the loop:
for z in range(0, len(cv_details)):
    df.iloc[df.iloc[:, z+1].to_numpy() == 0, z+2] = 'Other'
    mask2 = df.iloc[:, z+1] == 2
    mask1 = df.iloc[:, z+1] == 1
    df.iloc[mask2, z+2] = cv_details[z]
    if df.loc[mask2, 'frame'].empty:
        df.iloc[mask1, z+2] = 'ReviewNG-' + cv_details[z]
    else:
        df_frame_first = df.loc[mask2, 'frame'].iloc[0]
        df_frame_last = df.loc[mask2, 'frame'].iloc[-1]
        mask_lt_3 = ((df.frame - df_frame_first) <= 3) | ((df.frame - df_frame_last) <= 3)
        ones_lt_3 = mask1 & mask_lt_3
        ones_not_lt_3 = mask1 & (~mask_lt_3)
        df.iloc[ones_lt_3, z+2] = 'NG-' + cv_details[z]
        df.iloc[ones_not_lt_3, z+2] = 'ReviewNG-' + cv_details[z]
As I was trying to untangle the logic, I reworked it completely.
dd.loc[dd.result == 0, "sum_result"] = 'Other'
mask2 = dd.result == 2
mask1 = dd.result == 1
dd.loc[mask2, "sum_result"] = 'ICV'
if dd.loc[mask2, 'frame'].empty:
    dd.loc[mask1, "sum_result"] = 'No sum_result==2'
else:
    dd_frame_first = dd.loc[mask2, 'frame'].iloc[0]
    dd_frame_last = dd.loc[mask2, 'frame'].iloc[-1]
    mask_lt_3 = ((dd.frame - dd_frame_first) <= 3) | ((dd.frame - dd_frame_last) <= 3)
    ones_lt_3 = mask1 & mask_lt_3
    ones_not_lt_3 = mask1 & (~mask_lt_3)
    dd.loc[ones_lt_3, "sum_result"] = 'NG-ICV'
    dd.loc[ones_not_lt_3, "sum_result"] = 'ReviewNG-ICV'

Find entry exit signal from time series data

I have a small time-series dataset:
ser = pd.Series([2,3,4,5,6,0,8,7,1,3,4,0,6,4,0,2,4,0,4,5,0,1,7,0,1,8,5,3,6])
Let's say we choose a threshold of 5 to enter the market and zero to exit.
I am trying to write a program which will generate an output like this:
So far I have used numba, but I am still working on the logic. Can you please help?
#numba.vectorize
def check_signal(x, t):
    if x >= t:
        y = 2
    if x < t:
        y = 1
    if x == 0:
        y = -1
    else:
        y = y
    return y
Why would you use numba unless you had tens of millions of these samples?
states = ["Entered market", "inside market", "market exit", "outside market"]
state = 2
fout = open('seriesdata.csv','w')
print("Time,Percent_change,Signal,Timestamp", file=fout)
for pct in ser:
stamp = ''
if state == 1 and pct == 0:
state = 2
stamp = str(len(data)+1)
elif state == 3 and pct >= 5:
state = 0
stamp = str(len(data)+1)
else if state in (0, 2):
state += 1
print(''.join((str(pct), states[state], stamp)), file=fout)
If you'd rather make a dataframe, just accumulate those values in a list and convert after.
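For instance, a minimal sketch of that DataFrame variant (assuming the same ser, states, and state machine as above) might be:

import pandas as pd

states = ["Entered market", "inside market", "market exit", "outside market"]
state = 2
rows = []
for t, pct in enumerate(ser):
    stamp = ''
    if state == 1 and pct == 0:        # inside the market and the series hits zero: exit
        state = 2
        stamp = str(t + 1)
    elif state == 3 and pct >= 5:      # outside the market and the entry threshold is reached
        state = 0
        stamp = str(t + 1)
    elif state in (0, 2):              # move from the entry/exit event to the steady state
        state += 1
    rows.append({'Time': t, 'Percent_change': pct,
                 'Signal': states[state], 'Timestamp': stamp})

signals = pd.DataFrame(rows)           # same columns as the CSV version above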

Modifying a dataframe column and getting unexpected results

I have a dataframe like the one below:
There are actually 120,000 rows in this data and 20,000 users; this is just one user. For every user I need to make sure the prediction contains three "1"s and three "0"s.
I wrote the following function to do that:
def check_prediction_quality(df):
    df_n = df.copy()
    unique = df_n['userID'].unique()
    for i in range(len(unique)):
        ex_df = df[df['userID'] == unique[i]]
        v = ex_df['prediction'].tolist()
        v_bool = [i == 0 for i in v]
        if sum(v_bool) != 3:
            if sum(v_bool) > 3:
                res = [i for i, val in enumerate(v_bool) if val]
                diff = sum(v_bool) - 3
                for i in range(diff):
                    idx = np.random.choice(res, 1)[0]
                    v[idx] = float(1)
                    res.remove(idx)
            elif sum(v_bool) < 3:
                res = [i for i, val in enumerate(v_bool) if not val]
                diff = 3 - sum(v_bool)
                for i in range(diff):
                    idx = np.random.choice(res, 1)[0]
                    v[idx] = float(0)
                    res.remove(idx)
        for j in range(len(v)):
            df_n.loc[(0+i*6)+j:(6+i*6)+j, 'prediction'] = v[j]
    return df_n
However, when I run a check to see whether the numbers of "0"s and "1"s are the same, it turns out they are not. I am not sure what I did wrong.
sum([i == 0 for i in df['prediction']])
should be six using the example below, but when I run it on my 120,000-row dataframe, it does not have 60,000 of each.
data = {'userID': [199810, 199810, 199810, 199810, 199810, 199810,
                   199812, 199812, 199812, 199812, 199812, 199812],
        'trackID': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        'prediction': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0]}
df = pd.DataFrame(data=data)
df
Much appreciated!
When working with pandas dataframes you should reassign the post-processed DataFrame to the old one.
df = pd.DataFrame(np.array(...))
# reassignment:
df.loc[:, 3:5] = df.loc[:, 3:5] * 10  # this multiplies columns 3 to 5 by 10
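As a side note, one quick way to sanity-check the per-user counts (just a sketch, using the sample data from the question) is a groupby:

# counts of 0s and 1s per user; with the sample data each should be 3
print(df.groupby('userID')['prediction'].value_counts())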
Actually, never mind. I found out I don't have to modify the "0"s and "1"s.

No axis named 1 for object type Series

I'm using df.iloc[i] to assign (ori + gap) on each row of the dataframe, but I get a 'No axis named 1 for object type Series' error, and I don't understand why.
df1 = pd.read_csv('异常销量监控_0127_to Xiaolei Eagle send.csv', low_memory=False)
df2 = pd.read_csv('test0412.csv', dtype={'Actual': float})
gap = 0
for i in range(len(df2)):
    ym = df2['YM'].iloc[i]
    kcode = df2['REPKCode'].iloc[i]
    fn = df2['FamilyNameE'].iloc[i]
    ori = float(df2['Actual'].iloc[i])
    filt = (df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
    gap = df1[filt]['Actual']
    df2['Actual'].iloc[i] = (ori + gap)
df2.to_csv('after.csv', index=False)
The issue is in the following lines:
filt = (df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
gap = df1[filt]['Actual']
The value of filt will be either 1 or 0, because you are checking multiple conditions:
(df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
If the above condition is true, filt will be assigned 1; otherwise 0.
So your condition was true and filt == 1.
Now, in the following line
gap = df1[filt]['Actual']
you are actually doing this:
gap = df1[1]['Actual']
It is trying to find the column '1' in the df1 dataframe, and because there is no column named '1', that is why it gives you the error.
EDIT
Reply to your comment: how can I get the 'Actual' value where 'YM', 'REPKCode' and 'FamilyNameE' match in df1?
For that you need to write the line below:
gap = df1[df1['YM'] == ym][df1['REPKCode'] == kcode][df1['FamilyNameE'] == fn]['Actual']
and remove the lines below:
filt = (df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)
gap = df1[filt]['Actual']
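Note that gap is still a pandas Series after that filtering. If each (YM, REPKCode, FamilyNameE) combination matches exactly one row in df1 (an assumption), one way to sketch the update is to pull out a scalar before adding it:

matched = df1[(df1['YM'] == ym) & (df1['REPKCode'] == kcode) & (df1['FamilyNameE'] == fn)]
if not matched.empty:
    gap = float(matched['Actual'].iloc[0])        # single matching value as a scalar
    df2.loc[df2.index[i], 'Actual'] = ori + gap   # avoids chained-assignment issues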
I think the problem here is,
df2["YM"].iloc[i]
because when you type df2["YM"] it returns the YM column from the dataframe df2.
So that means you are trying to get a column from a column by typing df2["YM"].iloc[i].
Try df2.iloc[i] instead.

What is the fastest way to manipulate large csv files in Python?

I have been working on Python code which reads a CSV file with 800-odd rows and around 17,000 columns.
I would like to check each entry in the CSV file and see whether the number is bigger or smaller than a value; if it is, I assign a default value. I used pandas and worked with dataframes, apply, and lambda functions. It takes me 172 minutes to finish going through all the entries in the CSV file. Is that normal? Is there any faster way to do this? I am using Python 2.7. I don't know if it helps, but I am running it on a Windows 10 machine with 32 GB of RAM. Thanks in advance for the help.
The code is attached below.
def do_something(some_dataframe):
    col = get_req_colm(some_dataframe)
    modified_dataframe = pd.DataFrame()
    for k in col:
        temp_data = some_dataframe.apply(lambda x: check_for_range(x[k]), axis=1).tolist()
        dictionary = {}
        dictionary[str(k)] = temp_data
        temp_frame = pd.DataFrame(dictionary)
        modified_dataframe = pd.concat([modified_dataframe, temp_frame], axis=1)
    return modified_dataframe

def check_for_range(var):
    var = int(var)
    try:
        if var == 0:
            return 0
        if var == 1 or var == 4:
            return 1
        if var == 2 or var == 3 or var == 5 or var == 6:
            return 2
    except:
        print('error')

def get_req_colm(df):
    col = list(df)
    try:
        col.remove('index/Sample count')
        col.remove('index / Sample')
        col.remove('index')
        col.remove('count')
    except:
        pass
    return col

df_after_doing_something = do_something(some_dataframe)
df_after_doing_something.to_csv(output_folder + '\\df_after_doing_something.csv', index=False)
Using pandas for CSV data is a reasonable choice, but your code is not efficient. It will be faster if you try the code given below.
def do_something(some_dataframe):
    col = get_req_colm(some_dataframe)
    modified_dataframe = pd.DataFrame()
    for k in col:
        values = some_dataframe[k].to_numpy(dtype=int)
        # vectorized replacement with boolean masks instead of a row-wise apply
        mapped = np.zeros_like(values)               # 0 stays 0
        mapped[(values == 1) | (values == 4)] = 1
        mapped[np.isin(values, [2, 3, 5, 6])] = 2
        modified_dataframe[k] = mapped
    return modified_dataframe
def get_req_colm(df):
    col = list(df)
    try:
        col.remove('index/Sample count')
        col.remove('index / Sample')
        col.remove('index')
        col.remove('count')
    except:
        pass
    return col
This should work. Don't forget to import numpy:
import numpy as np
If this isn't clear, go through a numpy tutorial first; the link given below will also help:
Replacing elements in a numpy array when there are multiple conditions
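For example, a quick sketch with a made-up test frame (the column names here are just for illustration) would be:

import numpy as np
import pandas as pd

test = pd.DataFrame({'a': [0, 1, 5], 'b': [4, 2, 6]})
print(do_something(test))
# column 'a' becomes [0, 1, 2] and column 'b' becomes [1, 2, 2]
# (mapping: 0 -> 0, 1 or 4 -> 1, 2/3/5/6 -> 2)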
