I am new to Python and do not have a lot of experience.
I am trying to add constraints to this code so that the weight of any single stock cannot be equal to 0, but the weight of that same stock cannot be above 5% either. This constraint should apply to all of the stocks.
This is what I have so far; does anyone have tips on how to add these constraints?
Thanks in advance!
import os
import pandas as pd
import numpy as np
from scipy.optimize import linprog
data = pd.read_excel("data.xlsm")
# change data['WGT_GLOBAL'] s.t. EUTax = 20
data['Weights screened'] = data['WGT_GLOBAL']*data['Positiv screening']
EUTax = (data['Weights screened']*data['EU tax']).sum()
# min = -(x*data['YTD Return']).sum()
# s.t. x >= 0, x <= 1, (x*data['Positiv screening']*data['EU tax']).sum() = 20
obj = -(data['YTD Return'].fillna(0).to_numpy())  # maximize YTD return by minimizing its negative
bnd = [(0, 1)] * len(data)  # one (lower, upper) bound pair per stock weight
lhs_eq = [(data['Positiv screening']*data['EU tax']).to_numpy(), np.ones(len(data))]
rhs_eq = [20, 1]  # EU tax score of 20, weights summing to 1
opt = linprog(c=obj, A_eq=lhs_eq, b_eq=rhs_eq, bounds=bnd, method="revised simplex")
optimal_weights = opt.x
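For reference, a linear program cannot enforce a strict inequality such as weight > 0, so the usual workaround is a small positive lower bound combined with the 5% cap. Below is a minimal sketch of how that could be expressed through linprog's bounds argument, reusing obj, lhs_eq and rhs_eq from the snippet above; the epsilon value is an assumption you would tune.
eps = 1e-6                          # hypothetical small floor standing in for "not equal to 0"
bnd = [(eps, 0.05)] * len(data)     # eps <= weight <= 5% for every stock
opt = linprog(c=obj, A_eq=lhs_eq, b_eq=rhs_eq, bounds=bnd, method="revised simplex")
if opt.success:
    optimal_weights = opt.x
else:
    # With every weight capped at 5%, the weights can only sum to 1 if there are
    # at least 20 stocks, so an infeasible status here is worth checking.
    print(opt.status, opt.message)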
I'm trying to calculate CAPE and CIN in a specified layer using MetPy's cape_cin and get_layer functions. I'm doing this for a RAP vertical profile that I access from the NCDC server, for 3 different parcel types (ML, MU, and SB). I get the error ValueError: zero-size array to reduction operation minimum which has no identity when I try to compute the MU and SB profiles, but not for the ML profile, even though I calculate the layers in the exact same way:
First, I have to create vertical profiles of pressure, temperature, dewpoint, and height from the RAP output:
from datetime import datetime, timedelta
from siphon.catalog import TDSCatalog
from siphon.ncss import NCSS
import numpy as np
from metpy.units import units
import cartopy.crs as ccrs
from metpy.calc import (dewpoint_from_relative_humidity, mixed_parcel, most_unstable_parcel, parcel_profile, pressure_to_height_std, lcl, height_to_pressure_std, get_layer, cape_cin)
year=2019
month=5
day=23
hour=0
cenlat = 34.91269
cenlon = -98.21048
time_start = datetime(year, month, day, hour, 0) #specified time
hour = time_start.hour
if hour < 10:
    hour = '0'+str(hour)
day = time_start.day
if day < 10:
    day = '0'+str(day)
month = time_start.month
if month < 10:
    month = '0'+str(month)
cat = TDSCatalog('https://www.ncdc.noaa.gov/thredds/catalog/model-rap130-old/'+str(time_start.year)+str(month)+'/'+str(time_start.year)+str(month)+str(day)+'/catalog.html?dataset=rap130-old/'+str(time_start.year)+str(month)+'/'+str(time_start.year)+str(month)+str(day)+'/rap_130_'+str(time_start.year)+str(month)+str(day)+'_'+str(hour)+'00_000.grb2')
latest_ds = list(cat.datasets.values())[0]
print(latest_ds.access_urls)
ncss = NCSS(latest_ds.access_urls['NetcdfSubset'])
query = ncss.query()
query.variables('Pressure_surface').variables('Geopotential_height_isobaric').variables('Geopotential_height_surface').variables('Relative_humidity_isobaric').variables('Temperature_isobaric').variables('Dewpoint_temperature_height_above_ground').variables('Temperature_height_above_ground')
query.add_lonlat().lonlat_box(cenlon-2.1, cenlon +2.1, cenlat-2.1, cenlat+2.1)
data1 = ncss.get_data(query)
dlev = data1.variables['Geopotential_height_isobaric'].dimensions[1]
dlat = data1.variables['Geopotential_height_isobaric'].dimensions[2]
dlon = data1.variables['Geopotential_height_isobaric'].dimensions[3]
SFCP = (np.asarray(data1.variables['Pressure_surface'][:])/100.) * units('hPa')
hgt = np.asarray(data1.variables['Geopotential_height_isobaric'][:]) * units('meter')
sfc_hgt = np.asarray(data1.variables['Geopotential_height_surface'][:]) * units('meter')
Temp_up = np.asarray(data1.variables['Temperature_isobaric'][:]) * units('kelvin')
RH_up = np.asarray(data1.variables['Relative_humidity_isobaric'][:])
Td = (np.asarray(data1.variables['Dewpoint_temperature_height_above_ground'][:]) * units('kelvin')).to('degC')
T = np.asarray(data1.variables['Temperature_height_above_ground'][:]) * units('kelvin')
# Get the dimension data
lats_r = data1.variables[dlat][:]
lons_r= data1.variables[dlon][:]
lev = (np.asarray(data1.variables[dlev][:])/100.) * units('hPa')
# Set up our array of latitude and longitude values and transform to the desired projection.
flon = float(cenlon)
flat = float(cenlat)
crs = ccrs.PlateCarree()
crlons, crlats = np.meshgrid(lons_r[:]*1000, lats_r[:]*1000)
trlatlons = crs.transform_points(ccrs.LambertConformal(central_longitude=265, central_latitude=25, standard_parallels=(25.,25.)),crlons,crlats)
trlons = trlatlons[:,:,0]
trlats = trlatlons[:,:,1]
dlon = np.abs(trlons - cenlon)
dlat = np.abs(trlats - cenlat)
ilon = np.where(dlon == np.min(dlon)) #position in the dlon array with minimal difference between gridpoint lon and input lon
ilat = np.where(dlat == np.min(dlat)) #position in the dlat array with minimal difference between gridpoint lat and input lat
Tdc_up = dewpoint_from_relative_humidity(Temp_up[0,:,ilat[0][0], ilon[1][0]],RH_up[0,:,ilat[0][0], ilon[1][0]]/100)
p_sounding = np.sort(np.append(lev, SFCP[0,ilat[0][0], ilon[1][0]]))
ind = np.where(p_sounding >= SFCP[0,ilat[0][0], ilon[1][0]])[0][0]
hgt_sounding = np.insert(hgt[0,:,ilat[0][0], ilon[1][0]].magnitude, ind, sfc_hgt[0,ilat[0][0], ilon[1][0]].magnitude) * hgt.units
T_sounding = (np.insert(Temp_up[0,:,ilat[0][0], ilon[1][0]].magnitude, ind, T[0,0,ilat[0][0], ilon[1][0]].magnitude) * T.units).to(Tdc_up.units)
Td_sounding = np.insert(Tdc_up.magnitude, ind, Td[0,0,ilat[0][0], ilon[1][0]].magnitude) * Tdc_up.units
p_skewt = p_sounding[p_sounding <= SFCP[0,ilat[0][0], ilon[1][0]]]
hgt_skewt = hgt_sounding[p_sounding <= SFCP[0,ilat[0][0], ilon[1][0]]]
T_skewt = T_sounding[p_sounding <= SFCP[0,ilat[0][0], ilon[1][0]]]
Td_skewt = Td_sounding[p_sounding <= SFCP[0,ilat[0][0], ilon[1][0]]]
AGLhgts = hgt_skewt[::-1]-hgt_skewt[-1]
Next, I create vertical profiles for each parcel type using the mixed_parcel, most_unstable_parcel, and parcel_profile functions, and compute the pressure values for the top and bottom of the layer I want to calculate cape_cin for (the LCL to LCL+2 km):
ml_p, ml_T, ml_Td = mixed_parcel(np.flip(p_skewt), np.flip(T_skewt), np.flip(Td_skewt))
ml_profile = parcel_profile(p_skewt[::-1], ml_T, ml_Td)
ml_profile = (ml_profile - 273.15*units('kelvin')).magnitude*units('degC')
mu_p, mu_T, mu_Td, mu_index = most_unstable_parcel(np.flip(p_skewt), np.flip(T_skewt), np.flip(Td_skewt))
mu_profile = parcel_profile(p_skewt[::-1], mu_T, mu_Td)
mu_profile = (mu_profile - 273.15*units('kelvin')).magnitude*units('degC')
#Note: sbpcl_profile will have the exact same values of p_skewt, T_skewt, and Td_skewt in pprof below:
pprof = parcel_profile(p_skewt[::-1], T_skewt[-1], Td_skewt[-1])
pprof = (pprof - 273.15*units('kelvin')).magnitude*units('degC')
mllcl = lcl(ml_p, ml_T, ml_Td)
mllcl_h = pressure_to_height_std(mllcl[0]) - hgt_skewt[-1]
mulcl = lcl(mu_p, mu_T, mu_Td)
mulcl_h = pressure_to_height_std(mulcl[0]) - hgt_skewt[-1]
sblcl = lcl(p_skewt[-1], T_skewt[-1], Td_skewt[-1])
sblcl_h = pressure_to_height_std(sblcl[0]) - hgt_skewt[-1]
mllcl2000 = mllcl_h + 2*units('kilometer')
mulcl2000 = mulcl_h + 2*units('kilometer')
sblcl2000 = sblcl_h + 2*units('kilometer')
mllcl2000_p = height_to_pressure_std(mllcl2000)
mulcl2000_p = height_to_pressure_std(mulcl2000)
sblcl2000_p = height_to_pressure_std(sblcl2000)
With all of that computed, I use the get_layer function to create the arrays of pressure, temperature, dewpoint, and parcel temperature I need for cape_cin, and then compute the actual cape_cin values in the layer of interest:
ml_LCL_CAPE_layer = get_layer(p_skewt, T_skewt, Td_skewt, ml_profile[::-1], bottom = mllcl[0], depth = mllcl[0] - mllcl2000_p)
mu_LCL_CAPE_layer = get_layer(p_skewt, T_skewt, Td_skewt, mu_profile[::-1], bottom = mulcl[0], depth = mulcl[0] - mulcl2000_p)
sb_LCL_CAPE_layer = get_layer(p_skewt, T_skewt, Td_skewt, pprof[::-1], bottom = sblcl[0], depth = sblcl[0] - sblcl2000_p)
mlLCLCAPE = cape_cin(ml_LCL_CAPE_layer[0], ml_LCL_CAPE_layer[1], ml_LCL_CAPE_layer[2], ml_LCL_CAPE_layer[3])
muLCLCAPE = cape_cin(mu_LCL_CAPE_layer[0], mu_LCL_CAPE_layer[1], mu_LCL_CAPE_layer[2], mu_LCL_CAPE_layer[3])
sbLCLCAPE = cape_cin(sb_LCL_CAPE_layer[0], sb_LCL_CAPE_layer[1], sb_LCL_CAPE_layer[2], sb_LCL_CAPE_layer[3])
mlLCLCAPEcin = mlLCLCAPE[0] + mlLCLCAPE[1]
muLCLCAPEcin = muLCLCAPE[0] + muLCLCAPE[1]
sbLCLCAPEcin = sbLCLCAPE[0] + sbLCLCAPE[1]
The arrays of pressure, temperature, dewpoint, and parcel temperature for each of the 3 get_layer calls appear to be populated with the correct values, and these 4 arrays are the same shape for each parcel type. The mlLCLCAPEcin calculation above gives the correct output (99.26 J/kg, which verifies when I plot it on a Skew-T), but the exact same calculation for the MU and SB profiles gives the error referenced above. I'm using MetPy v1.1, and I have tried a different location and output from a different forecast hour, and I still run into the same issue.
If I fix up your example code above (there were some name issues and some indexing issues) with:
T_sounding = (np.insert(Temp_up[0,:,ilat[0][0], ilon[1][0]].magnitude, ind, Temp_up[0,0,ilat[0][0], ilon[1][0]].magnitude) * Temp_up.units).to(Tdc_up.units)
Td_sounding = np.insert(Tdc_up.magnitude, ind, Tdc_up[0].magnitude) * Tdc_up.units
I don't get any error running right now. If my fix above doesn't fix it for you, it would be helpful to know if you're running the latest MetPy 1.1.
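As a quick check, something along these lines (reusing the layer variables from the post above) would confirm which MetPy version is actually being imported and the shape of each array handed to cape_cin:
import metpy

print(metpy.__version__)
for name, layer in (('ML', ml_LCL_CAPE_layer),
                    ('MU', mu_LCL_CAPE_layer),
                    ('SB', sb_LCL_CAPE_layer)):
    # each layer is the (pressure, temperature, dewpoint, parcel profile) tuple from get_layer
    print(name, [arr.shape for arr in layer])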
I have a formula that is used to predict athletic performance based on daily stress.
It is based on 5 constants unique to each person. I'm trying to find these constants from the daily stress values and the performance testing that has been done. I'm new to programming and I don't know where to start.
The formula (each day, fitness and fatigue are yesterday's values with an exponential decay applied, plus today's stress):
Performance(t) = k1*Fitness(t) - k2*Fatigue(t) + P0, with Fitness(t) = Fitness(t-1)*exp(-1/t1) + stress(t) and Fatigue(t) = Fatigue(t-1)*exp(-1/t2) + stress(t)
This is a sample of the data: data
thank you
import pandas as pd
import numpy as np
import math
from scipy import optimize
data = pd.read_csv('data_mod1.csv')
TSS = data['stress'].fillna(0)
arr = np.array(TSS)
#data = data.dropna()
a = [arr[0]]
b = [arr[0]]
x = arr[1:]
def Banister(x, t1, t2, k1, k2, c):
    for v in x:
        a.append(a[-1]*np.exp(-1/t1) + v)
        b.append(b[-1]*np.exp(-1/t2) + v)
    data['fit'] = pd.Series(a)
    data['fat'] = pd.Series(b)
    data['perf'] = ((data['fit']*k1)-(data['fat']*k2))+c
    return data['perf']
from scipy.optimize import curve_fit
fit = curve_fit(Banister, arr, data[data.index], p0=[20, 10, 1, 2, 50])
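As a starting point, curve_fit expects a model function of the form f(xdata, *params) that returns an array the same length as ydata, and it evaluates that function many times, so appending to the a and b lists defined outside Banister makes them grow on every call. Here is a minimal sketch of how the model could be restructured for fitting, assuming the measured test results sit in a hypothetical 'performance' column of the same CSV:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

def banister_model(stress, t1, t2, k1, k2, P0):
    # Recompute fitness and fatigue from scratch on every call, so the repeated
    # evaluations made by curve_fit do not accumulate state in outside lists.
    fitness = np.zeros(len(stress))
    fatigue = np.zeros(len(stress))
    fitness[0] = fatigue[0] = stress[0]
    for i in range(1, len(stress)):
        fitness[i] = fitness[i-1]*np.exp(-1/t1) + stress[i]
        fatigue[i] = fatigue[i-1]*np.exp(-1/t2) + stress[i]
    return k1*fitness - k2*fatigue + P0

data = pd.read_csv('data_mod1.csv')
stress = data['stress'].fillna(0).to_numpy()
performance = data['performance'].to_numpy()   # hypothetical column holding the test results
# Days without a performance test would need to be dropped or masked before fitting.
params, cov = curve_fit(banister_model, stress, performance, p0=[20, 10, 1, 2, 50])
t1, t2, k1, k2, P0 = params
The fitted constants come back in params in the same order as the function signature (t1, t2, k1, k2, P0).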
I am very new to Data Science and Python. After a few hours of experimentation, I finally got values out of my gradient descent (code below). What I am having trouble with is the plotting: how can I plot the regression line automatically after the algorithm has run?
import numpy as np
import matplotlib.pyplot as plt
import csv
import pandas as pd
def gradient_descent(x, y):
    m_curr = b_curr = 0
    iterations = 5000
    n = len(x)
    learning_rate = 0.01
    for i in range(iterations):
        y_predicted = m_curr*x + b_curr
        cost = (1/n)*sum([val**2 for val in (y-y_predicted)])
        md = -(2/n)*sum(x*(y-y_predicted))
        bd = -(2/n)*sum(y-y_predicted)
        m_curr = m_curr - learning_rate*md
        b_curr = b_curr - learning_rate*bd
        print("m{}, b{}, cost {}, iteration {}".format(m_curr, b_curr, cost, i))
if __name__ == '__main__':
    # Reading data -> Output: DataFrame in float64
    data = pd.read_csv('ex1data1.txt', sep=',', header=None, names=['Feature', 'Label'])
    data.plot(x='Feature', y='Label', kind='scatter')
    # Separating the data frame into feature and label vectors
    feat_vec = pd.DataFrame(data['Feature'])
    label_vec = pd.DataFrame(data['Label'])
    # Finding the best-fit line for the given dataset; convert the DataFrames to np.array
    # because that is more convenient for matrix multiplication
    x = np.array(feat_vec)
    y = np.array(label_vec)
    gradient_descent(x, y)
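A minimal sketch of one way to plot the fitted line afterwards, assuming gradient_descent is changed to end with return m_curr, b_curr (right now it only prints them) and reusing the x and y arrays built above:
import matplotlib.pyplot as plt

m, b = gradient_descent(x, y)          # assumes the function now returns its final slope and intercept
plt.scatter(x, y, label='data')
plt.plot(x, m*x + b, color='red', label='fitted line')
plt.xlabel('Feature')
plt.ylabel('Label')
plt.legend()
plt.show()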
I am trying to run a Fama-MacBeth regression in Python. As a first step I am running the time-series regression for every asset in my portfolio, but I am unable to run it because I am getting this error:
'ValueError: Must pass DataFrame with boolean values only'
I am relatively new to Python and have relied heavily on this forum to help me out. I hope you can help me with this issue.
Please let me know how I can resolve this. I will be very grateful to you!
I assume the loop below is producing the error, because when I run the function without the for loop, it works perfectly.
for i in range(cols):
    df_beta = RegressionRoll(df=data_set, subset=0, dependent=data_set.iloc[:,i],
                             independent=data_set.iloc[:,30:], const=True,
                             parameters='beta', win=12)
The dimensions of my matrix are 108x35: 30 stocks and 5 factors over 108 time points. Hence I want to run a regression of every stock on the five factors and store the resulting coefficients in a DataFrame. Sample dataframe:
Date        BAS GY    AI FP    SGL GY    LNA GY    AKZA NA   Market Factor
1/29/2010   -5.28%    -7.55%   -1.23%    -5.82%    -7.09%    -5.82%
2/26/2010    0.04%    13.04%   -1.84%     4.06%   -14.62%   -14.62%
3/31/2010   10.75%     1.32%    7.33%     6.61%    12.21%    12.21%
The following is the entire code:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
data_set = pd.read_excel(r'C:\XXX\Research Project\Data\Regression.xlsx', sheet_name = 'Fama Macbeth')
data_set.set_index(data_set['Date'], inplace=True)
data_set.drop('Date', axis=1, inplace=True)
X = data_set.iloc[:,30:]
y = data_set.iloc[:,:30]
def RegressionRoll(df, subset, dependent, independent, const, win, parameters):
    # Data subset
    if subset != 0:
        df = df.tail(subset)
    else:
        df = df
    # Loop info
    end = df.shape[0]
    win = win
    rng = np.arange(start=win, stop=end, step=1)
    # Subset and store dataframes
    frames = {}
    n = 1
    for i in rng:
        df_temp = df.iloc[:i].tail(win)
        newname = 'df' + str(n)
        frames.update({newname: df_temp})
        n += 1
    # Analysis on subsets
    df_results = pd.DataFrame()
    for frame in frames:
        #print(frames[frame])
        # Rolling data frames
        dfr = frames[frame]
        y = dependent
        x = independent
        if const == True:
            x = sm.add_constant(dfr[x])
            model = sm.OLS(dfr[y], x).fit()
        else:
            model = sm.OLS(dfr[y], dfr[x]).fit()
        if parameters == 'beta':
            theParams = model.params[0:]
            coefs = theParams.to_frame()
            df_temp = pd.DataFrame(coefs.T)
            indx = dfr.tail(1).index[-1]
            df_temp['Date'] = indx
            df_temp = df_temp.set_index(['Date'])
            df_results = pd.concat([df_results, df_temp], axis=0)
        if parameters == 'R2':
            theParams = model.rsquared
            df_temp = pd.DataFrame([theParams])
            indx = dfr.tail(1).index[-1]
            df_temp['Date'] = indx
            df_temp = df_temp.set_index(['Date'])
            df_temp.columns = [', '.join(independent)]
            df_results = pd.concat([df_results, df_temp], axis=0)
    return df_results
cols = len(y.columns)
for i in range(cols):
    df_beta = RegressionRoll(df=data_set, subset=0, dependent=data_set.iloc[:,i],
                             independent=data_set.iloc[:,30:], const=True,
                             parameters='beta', win=12)
ValueError: Must pass DataFrame with boolean values only
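For what it's worth, that message is the one pandas raises when a DataFrame is indexed with another DataFrame that is not boolean, which appears to be what happens inside RegressionRoll at dfr[x] when independent is passed as the actual factor data (data_set.iloc[:,30:]) rather than as column labels; dfr[y] runs into a similar problem with the Series passed as dependent. Below is a minimal sketch of the call rewritten to pass labels instead, leaving RegressionRoll itself unchanged (the betas dict is just one hypothetical way to keep the per-stock results instead of overwriting df_beta each pass):
stock_cols = list(data_set.columns[:30])    # the 30 stock return columns
factor_cols = list(data_set.columns[30:])   # the factor columns

betas = {}
for col in stock_cols:
    betas[col] = RegressionRoll(df=data_set, subset=0,
                                dependent=col,             # column label, so dfr[y] selects by name
                                independent=factor_cols,   # list of labels, so dfr[x] selects by name
                                const=True, parameters='beta', win=12)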