How to scattering a numpy array in python using comm.Scatterv - python

I am tring to write a MPI-based code to do some calculation using python and MPI4py. However, following the example, I CANNOT scatter a numpy vector into cores. Here is the code and errors, is there anyone can help me? Thanks.
import numpy as np
from mpi4py import MPI
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
n = 6
if rank == 0:
d1 = np.arange(1, n+1)
split = np.array_split(d1, size)
split_size = [len(split[i]) for i in range(len(split))]
split_disp = np.insert(np.cumsum(split_size), 0, 0)[0:-1]
else:
#Create variables on other cores
d1 = None
split = None
split_size = None
split_disp = None
split_size = comm.bcast(split_size, root = 0)
split_disp = comm.bcast(split_disp, root = 0)
d1_local = np.zeros(split_size[rank])
comm.Scatterv([d1, split_size, split_disp, MPI.DOUBLE], d1_local, root=0)
print('rank ', rank, ': ', d1_local)
And the error result is:
rank 2 : [ 2.47032823e-323]
rank 3 : [ 2.96439388e-323]
rank 0 : [ 4.94065646e-324 9.88131292e-324]
rank 1 : [ 1.48219694e-323 1.97626258e-323]
Thanks.

The data type is not correct. I should specify the type of the array:
d1 = np.arange(1, n+1, dtype='float64')

Related

mpi4py showing error when scattering string

I am trying to scatter an array of strings when I am scattering one char. it is working fine , however when I change to strings it showing an error like that scattering is not able to scatter strings
data = np.array(["mmm","bbbb","css","ddd","e","f","g","h"],dtype=np.str_)
import numpy as np
comm = MPI.COMM_WORLD
size = comm.Get_size() # new: gives number of ranks in comm
rank = comm.Get_rank()
numDataPerRank = 4
data = None
if rank == 0:
data = np.array(["m","b","c","d","e","f","g","h"],dtype=np.str_)
# when size=4 (using -n 4), data = [1.0:40.0]
recvbuf = np.empty(numDataPerRank, dtype=np.str_) # allocate space for recvbuf
comm.Scatter(data, recvbuf, root=0)
print('Rank: ',rank, ', recvbuf received: ',recvbuf)
data = np.array(["m","b","c","d","e","f","g","h"],dtype=np.str_) working with scatter
data = np.array(["mmm","bbbb","css","ddd","e","f","g","h"],dtype=np.str_) is not working

How to load data as a matrix (using csv file) into python?

I would like to solve an LPP (Max CX s.to. AX<=B, X>=0) using pyomo, I have loaded the data for C, and B. However I tried to load the matrix A but after solving the model it does not show the correct answer. The code that I tried is in the following lines. Please suggest how to solve this type of problem in pyomo.
from pathlib import Path
from pyomo.environ import *
import pandas as pd
import numpy as np
path_model_in_set = Path('tmp/input/set')
path_model_in_par = Path('tmp/input/par')
path_model_out = Path('tmp/output')
path_results = Path('tmp/output')
m = AbstractModel('LPP')
#Set
m.a = Set(ordered=True, doc="indices for row")
m.b = Set(ordered=True, doc="indices for column")
#Parameters
m.A = Param(m.a, m.b, mutable=True, initialize=0, doc="Matrix")
m.B = Param(m.a, doc="RHS")
m.C = Param(m.b, doc="cost coefficients")
#variable
m.x = Var(m.b, within=NonNegativeReals)
def obj_rule(m):
return sum(m.C[t]*m.x[t] for t in m.b)
m.object_f = Objective(rule=obj_rule, sense=maximize, doc="Objective function")
def cons_rule(m,i):
return sum(m.A[i,j]*m.x[j] for j in m.b) <= m.B[i]
m.cons1 = Constraint(m.a, rule=cons_rule, doc="constraints")
data = DataPortal()
data.load(filename=str(path_model_in_set.joinpath('a.csv')), format='set', set='a')
data.load(filename=str(path_model_in_set.joinpath('b.csv')), format='set', set='b')
#the problem is in this line.
data. Load(filename=str(path_model_in_par.joinpath('Matrix_info.csv')), format='array', index=['A'], param=['1','2','3'])
data. Load(filename=str(path_model_in_par.joinpath('B.csv')), index=['a'], param=['B'])
data.load(filename=str(path_model_in_par.joinpath('C.csv')), index=['b'], param=['C'])
instance = m.create_instance(data)
print(instance.A)
optimizer = 'gurobi'
solver = SolverFactory(optimizer)
instance.write('LPP.lp', io_options={'symbolic_solver_labels': True})
solver_results = solver.solve(instance, tee=True)
solver_results.write()
instance.solutions.load_from(solver_results)
CSV files for matrix A is
A 1 2 3
1 2 1 1
2 1 2 1
3 0 0 1

Importing an array using Python

I have an array Pe with dimensions (2,3,3). I am importing Pe but then I want the code to calculate for each Pe and give me visited indices. Right now, I have to manually add Pe.Pe[0],Pe.Pe[1] in the line Visited_Indices,timestamps=iterate_array(0,0, Pe.Pe[0], lambda x : x < 150). How do I implement Pe.Pe[i] where i goes from 0 to 1?
Pe array is
import numpy as np
Pe = np.array([[[128.22918457, 168.52413295, 209.72343319],
[129.01598287, 179.03716051, 150.68633749],
[131.00688309, 187.42601593, 193.68172751]],
[[ 87.70103267, 115.2603484 , 143.4381863 ],
[ 88.23915528, 122.45062554, 103.06037156],
[ 89.60081102, 128.18809696, 132.46662659]]])
print([Pe])
The code is
import numpy as np
import time
import Pe
def get_neighbor_indices(position, dimensions):
'''
dimensions is a shape of np.array
'''
i, j = position
indices = [(i+1,j), (i-1,j), (i,j+1), (i,j-1)]
return [
(i,j) for i,j in indices
if i>=0 and i<dimensions[0]
and j>=0 and j<dimensions[1]
]
def iterate_array(init_i, init_j, arr, condition_func):
'''
arr is an instance of np.array
condition_func is a function (value) => boolean
'''
indices_to_check = [(init_i,init_j)]
checked_indices = set()
result = []
t0 = None
t1 = None
timestamps = []
while indices_to_check:
pos = indices_to_check.pop()
if pos in checked_indices:
continue
item = arr[pos]
checked_indices.add(pos)
if condition_func(item):
result.append(item)
t1=time.time()
if(t0==None):
t0=t1
timestamps.append(t1-t0)
indices_to_check.extend(
get_neighbor_indices(pos, arr.shape)
)
return result,timestamps
Visited_Indices,timestamps=iterate_array(0,0, Pe.Pe[0], lambda x : x < 150)
out = list(zip(*np.where(np.isin(Pe, Visited_Indices))))
print("Visited =",[Visited_Indices])

Why is my code generating a scalar array error despite providing reshape array command?

I am a new python learner. While writing code, from one of online course, I am having an error related to array. I reviewed multiple times, but remained unable to find the error.
Here is the code:
boston_dataset = load_boston()
data = pd.DataFrame(data=boston_dataset.data, columns=boston_dataset.feature_names)
features = data.drop(['INDUS','AGE'], axis=1)
log_prices = np.log(boston_dataset.target)
target = pd.DataFrame(log_prices, columns=['PRICE'])
property_stats = features.mean().values.reshape(1,11)
regr = LinearRegression().fit(features, target)
regr.predict(features)
fitted_vals = regr.predict(features)
MSE = mean_squared_error(target, fitted_vals)
RMSE = np.sqrt(MSE)
CRIME_IDX = 0
ZN_IDX = 1
CHAS_IDX = 2
NOX_IDX = 3
RM_IDX = 4
DIS_IDX = 5
RAD_IDX = 6
TAX_IDX = 7
PTRATIO_IDX = 8
B_IDX = 9
LSTAT_IDX = 10
def get_log_estimate(nr_rooms,
students_per_classroom,
next_to_river=False,
high_confidence=True):
property_stats[0][RM_IDX] = nr_rooms
property_stats[0][PTRATIO_IDX] = students_per_classroom
log_estimate = regr.predict(property_stats[0][0])
if next_to_river:
property_stats[0][CHAS_IDX] = 1
else:
property_stats[0][CHAS_IDX] = 0
if high_confidence:
upper_bound = log_estimate + 2*RMSE
lower_bound = log_estimate - 2*RMSE
interval = 95
else:
upper_bound = log_estimate + RMSE
lower_bound = log_estimate - RMSE
interval = 68
return log_estimate, upper_bound, lower_bound, interval
While running these lines of code, I am having this error:
"
ValueError: Expected 2D array, got scalar array instead:
array=3.6135235573122535.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
"
I called below line of code
get_log_estimate(5, 20)
But still getting the same error.
I am also new to Data Science so, I don't know the answer for sure. But in line 8 of your code , I would suggest you to try this: regr.predict([features])

How to resolve Boolean value error in linear regression model in python?

I am trying to run a fama-macbeth regression in a python. As afirst step I am running the time series for every asset in my portfolio but I am unable to run it because I am getting an error:
'ValueError: Must pass DataFrame with boolean values only'
I am relatively new to python and have heavily relied on this forum to help me out. I hope it you can help me with this issue.
Please let me know how I can resolve this. I will be very grateful to you!
I assume this line is producing the error. Cause when I run the function without the for loop, it works perfectly.
for i in range(cols):
df_beta = RegressionRoll(df=data_set, subset = 0, dependent = data_set.iloc[:,i], independent = data_set.iloc[:,30:], const = True, parameters = 'beta',
win = 12)
The dimension of my matrix is 108x35, 30 stocks and 5 factors over 108 points. Hence I want to run a regression for every stock against the 4 factors and store the result of the coeffs in a dataframe. Sample dataframe:
Date BAS GY AI FP SGL GY LNA GY AKZA NA Market Factor
1/29/2010 -5.28% -7.55% -1.23% -5.82% -7.09% -5.82%
2/26/2010 0.04% 13.04% -1.84% 4.06% -14.62% -14.62%
3/31/2010 10.75% 1.32% 7.33% 6.61% 12.21% 12.21%
The following is the entire code:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
data_set = pd.read_excel(r'C:\XXX\Research Project\Data\Regression.xlsx', sheet_name = 'Fama Macbeth')
data_set.set_index(data_set['Date'], inplace=True)
data_set.drop('Date', axis=1, inplace=True)
X = data_set.iloc[:,30:]
y = data_set.iloc[:,:30]
def RegressionRoll(df, subset, dependent, independent, const, win, parameters):
# Data subset
if subset != 0:
df = df.tail(subset)
else:
df = df
# Loopinfo
end = df.shape[0]
win = win
rng = np.arange(start = win, stop = end, step = 1)
# Subset and store dataframes
frames = {}
n = 1
for i in rng:
df_temp = df.iloc[:i].tail(win)
newname = 'df' + str(n)
frames.update({newname: df_temp})
n += 1
# Analysis on subsets
df_results = pd.DataFrame()
for frame in frames:
#print(frames[frame])
# Rolling data frames
dfr = frames[frame]
y = dependent
x = independent
if const == True:
x = sm.add_constant(dfr[x])
model = sm.OLS(dfr[y], x).fit()
else:
model = sm.OLS(dfr[y], dfr[x]).fit()
if parameters == 'beta':
theParams = model.params[0:]
coefs = theParams.to_frame()
df_temp = pd.DataFrame(coefs.T)
indx = dfr.tail(1).index[-1]
df_temp['Date'] = indx
df_temp = df_temp.set_index(['Date'])
df_results = pd.concat([df_results, df_temp], axis = 0)
if parameters == 'R2':
theParams = model.rsquared
df_temp = pd.DataFrame([theParams])
indx = dfr.tail(1).index[-1]
df_temp['Date'] = indx
df_temp = df_temp.set_index(['Date'])
df_temp.columns = [', '.join(independent)]
df_results = pd.concat([df_results, df_temp], axis = 0)
return(df_results)
cols = len(y.columns)
for i in range(cols):
df_beta = RegressionRoll(df=data_set, subset = 0, dependent = data_set.iloc[:,i], independent = data_set.iloc[:,30:], const = True, parameters = 'beta',
win = 12)
ValueError: Must pass DataFrame with boolean values only

Categories