Most of my samples are repetitions. Is there a way to give each sample a weight representing how frequent it is, so that the algorithm only has to go through the unique set?
Or is there a way to manipulate the log(probability) function that I have defined to achieve this effect?
import numpy as np
import pymc as mc

# simple example for data:
data = [(0,1,10), (0,2,10), (1,0,20), (1,0,20), (1,0,20), (0,0,49), (1,1,12)]

member_a = mc.Uniform('a', lower=-1.0, upper=0.0)
member_d = mc.Uniform('d', lower=-1.0, upper=0.0)

@mc.stochastic(observed=True, dtype=int)
def logLikelihood(value=data, a=member_a, d=member_d):
    ratesMatrix = np.zeros((2,2))
    ratesMatrix[0,0] = a
    ratesMatrix[0,1] = -a
    ratesMatrix[1,0] = -d
    ratesMatrix[1,1] = d
    r = []
    t = []
    for i in range(len(value)):
        r.append(ratesMatrix[int(value[i][0]), int(value[i][1])])
        t.append(value[i][2])
    r = np.array(r, dtype=np.float64)
    t = np.array(t, dtype=np.float64)

model = mc.MCMC([member_a, member_d, logLikelihood])
trace = model.sample(iter=5000)
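One way to get this effect (a minimal sketch, not a specific PyMC feature) is to collapse the repeated rows into unique rows plus counts before building the likelihood, and then weight each row's log-probability term by its count. Assuming the log-likelihood is a sum of independent per-row terms, this gives the same total while only looping over the unique set; per_row_logp below is just a placeholder name for whatever per-row term you already compute:
from collections import Counter

counts = Counter(data)                                  # row -> number of repetitions
unique_rows = list(counts.keys())
weights = np.array(list(counts.values()), dtype=np.float64)

# Inside the @mc.stochastic function, pass value=unique_rows, compute the
# per-row log-probability terms as before (one entry per unique row), and
# return np.sum(weights * per_row_logp) instead of summing over all rows.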
(Edited to include dataset and model code)
I'm training a Keras CNN on a 2D matrix. I'm creating my own training dataset, in which each matrix cell has the shape [[list], int]. The cell's first item is derived from a string class that I convert to a one-hot list (using tf.keras.utils.to_categorical):
cell[0] = to_categorical(
    rnd_type-1, num_classes=num_types)
The second is a simple int:
cell[1] = random.randint(0, max_val)
The dataset creation function:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    grids_list = []
    target_list = []
    target = 0
    for _ in range(num_of_samples):
        # create empty grid
        grid = [[[[],0] for i in range(grid_y)] for j in range(grid_x)]
        for i in range(grid_x):
            for j in range(grid_y):
                # get random class and convert to cat list
                rnd_type = random.randint(0, num_types)
                cat = to_categorical(rnd_type-1, num_classes=num_types)
                # get random height
                rnd_height = random.randint(0, max_height)
                # inject the two values into the cell
                grid[i][j] = [cat, rnd_height]
                # get some target value
                target += rnd_type * 5 + random.random()*5
        target_list.append(target)
        grids_list.append(grid)
    # make np arrs out of the lists
    t = np.array(target_list)
    g = np.array(grids_list)
    return t, g
My model is created using model = models.create_cnn(grid_size, grid_size, 2, regress=True), in which (I assumed) the input depth is 2.
The model creation code:
num_types = 20
max_height = 50
num_of_samples = 10
grid_size = 10
epochs = 5000
# get n results of X x Y grid with target
targets_list, grids_list = datasets.make_data(
    num_of_samples, num_types, max_height, grid_size, grid_size)
split = train_test_split(targets_list, grids_list,
                         test_size=0.25, random_state=42)
(train_attr_X, test_attr_X, train_grids_X, test_grids_X) = split
# find the largest value in the training set and use it to
# scale values to the range [0, 1]
max_target = train_attr_X.max()
train_attr_Y = train_attr_X / max_target
test_attr_Y = test_attr_X / max_target
model = models.create_cnn(grid_size, grid_size, 2, regress=True)
I, however, cannot train it; it fails with this error: ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).
Answering my own question:
The model can only accept an int as depth. Therefore, each cell must be a flat list of numbers (so the depth is its length), not a nested [list, int] structure. For that reason, the way to merge the class data with the continuous rnd_height field is:
class => cat = to_categorical
cell = np.append(cat, [rnd_height])
This way, the rnd_height value is appended to the end of the cat list.
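For instance (a tiny illustration with made-up values):
cat = to_categorical(3, num_classes=5)   # [0., 0., 0., 1., 0.]
cell = np.append(cat, [7])               # [0., 0., 0., 1., 0., 7.]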
The whole dataset function now looks like this:
def make_data(num_of_samples, num_types, max_height, grid_x, grid_y):
    grids_list = []
    target_list = []
    target = 0
    for _ in range(num_of_samples):
        grid = [[[False, False] for i in range(grid_y)] for j in range(grid_x)]
        for i in range(grid_x):
            for j in range(grid_y):
                rnd_type = random.randint(0, num_types)
                cat = to_categorical(rnd_type-1, num_classes=num_types)
                rnd_height = random.randint(0, max_height)
                cell = np.append(cat, [rnd_height])
                grid[i][j] = cell
                # simulate simple objective function
                if rnd_type < num_types/5:
                    target += rnd_height * 5
        target_list.append(target)
        grids_list.append(grid)
    t = np.array(target_list)
    g = np.array(grids_list)
    # return grids and targets
    return g, t
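As a quick sanity check (a minimal sketch calling the function above with small, arbitrary arguments), the resulting grid array now has a numeric depth of num_types + 1 instead of a nested cell:
g, t = make_data(num_of_samples=2, num_types=20, max_height=50, grid_x=10, grid_y=10)
print(g.shape)   # (2, 10, 10, 21): 20 one-hot entries plus the height per cell
print(t.shape)   # (2,)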
I have written a function to compute KS values for various iterations of a logistic regression model. When I run the code, the KS value is printed on the screen but it is not getting stored in the table.
Custom KS Function
def ks_value(bad_flag=None, predicted_prob=None):
    ## Data Prep
    ksdf = pd.DataFrame([])
    ksdf['bad_flag'] = bad_flag
    ksdf['probability'] = predicted_prob
    ksdf = ksdf.reset_index()
    ksdf.drop(columns=['index'], inplace=True)
    ksdf['decile'] = pd.qcut(ksdf['probability'], 10, labels=['1','2','3','4','5','6','7','8','9','10'])
    ksdf['good_flag'] = 1 - ksdf['bad_flag']
    ksdf.head()

    ## Pivot
    ksdf1 = pd.pivot_table(data=ksdf, index=['decile'], values=['bad_flag','good_flag','probability'],
                           aggfunc={'bad_flag': [np.sum],
                                    'good_flag': [np.sum],
                                    'probability': [np.min, np.max]})

    ## Add Columns
    ksdf1['total_counts'] = ksdf1['bad_flag'] + ksdf1['good_flag']
    ksdf1 = ksdf1.reset_index()
    ksdf1.columns = ['Decile','Defaulter_Count','Non-Defaulter_Count','max_score','min_score','Total_Count']
    ksdf1 = ksdf1.sort_values(by='min_score', ascending=False)
    ksdf1['Default_Rate'] = (ksdf1['Defaulter_Count'] / ksdf1['Total_Count']).apply('{0:.2%}'.format)
    default_sum = ksdf1['Defaulter_Count'].sum()
    non_default_sum = ksdf1['Non-Defaulter_Count'].sum()
    ksdf1['Default %'] = (ksdf1['Defaulter_Count'] / default_sum).apply('{0:.2%}'.format)
    ksdf1['Non_Default %'] = (ksdf1['Non-Defaulter_Count'] / non_default_sum).apply('{0:.2%}'.format)

    ## Compute KS
    ksdf1['ks_stats'] = np.round(((ksdf1['Defaulter_Count'] / ksdf1['Defaulter_Count'].sum()).cumsum() - (ksdf1['Non-Defaulter_Count'] / ksdf1['Non-Defaulter_Count'].sum()).cumsum()), 4) * 100
    return ksdf1['ks_stats'].max()
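For reference, the statistic computed above is the largest gap between the two cumulative distributions across score-ordered deciles. Conceptually, with hypothetical per-decile count arrays (just to illustrate the ks_stats column):
bad = np.array([30, 22, 15, 10, 8, 5, 4, 3, 2, 1])           # defaulters per decile, highest score first
good = np.array([5, 10, 20, 40, 60, 80, 90, 95, 100, 100])   # non-defaulters per decile
ks = 100 * np.max(np.cumsum(bad) / bad.sum() - np.cumsum(good) / good.sum())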
Code for iterating on Logistic Regression
# Iterating to find the Optimal value of C for model overfitting - Checks on Test Data
C_param_range = [0.001, 0.01, 0.1, 1, 10, 100]
table1 = pd.DataFrame(columns=['C_parameter','Test Accuracy','Train Accuracy','Test KS','Train KS'])
table1['C_parameter'] = C_param_range
j = 0
for i in C_param_range:
    # Apply logistic regression model to training data
    lr = LogisticRegression(penalty='l2', C=i, random_state=0, max_iter=1000)
    lr.fit(X_train, y_train)
    # Predict class (0,1) using model
    y_pred = lr.predict(X_test)
    y_pred2 = lr.predict(X_train)
    y_prob = lr.predict_proba(X_test)[:,1]
    y_prob2 = lr.predict_proba(X_train)[:,1]
    # KS Value
    table1.iloc[j,3] = ks_value(bad_flag=y_test, predicted_prob=y_prob)
    table1.iloc[j,4] = ks_value(bad_flag=y_train, predicted_prob=y_prob2)
    # Saving accuracy score in table
    table1.iloc[j,1] = accuracy_score(y_test, y_pred)
    table1.iloc[j,2] = accuracy_score(y_train, y_pred2)
    j += 1
Output is something like this:
KS is 35.49
KS is 34.25
C_parameter TestAccuracy TrainAccuracy TestKS TrainKS
0.001 0.919911 0.919056 NaN NaN
I'm running a loop that appends values to an empty dataframe outside of the loop. However, when this is done, the dataframe remains empty. I'm not sure what's going on. The goal is to find the power value that results in the lowest sum of squared residuals.
Example code below:
import numpy as np
import pandas as pd
import tweedie

power_list = np.arange(1.3, 2, .01)
mean = 353.77
std = 17298.24
size = 860310
x = tweedie.tweedie(mu=mean, p=1.5, phi=50).rvs(size)
variance = 299228898.89

sum_ssr_df = pd.DataFrame(columns=['power', 'dispersion', 'ssr'])
for i in power_list:
    power = i
    phi = variance / (mean**power)
    tvs = tweedie.tweedie(mu=mean, p=power, phi=phi).rvs(len(x))
    sort_tvs = np.sort(tvs)
    df = pd.DataFrame([x, sort_tvs]).transpose()
    df.columns = ['actual', 'random']
    df['residual'] = df['actual'] - df['random']
    ssr = df['residual']**2
    sum_ssr = np.sum(ssr)
    df_i = pd.DataFrame([i, phi, sum_ssr])
    df_i = df_i.transpose()
    df_i.columns = ['power', 'dispersion', 'ssr']
    sum_ssr_df.append(df_i)

sum_ssr_df[sum_ssr_df['ssr'] == sum_ssr_df['ssr'].min()]
What exactly am I doing incorrectly?
This code isn't as efficient as it could be, as noted by ALollz. When you append, pandas basically creates a new dataframe in memory each time (I'm oversimplifying here).
The error in your code is:
sum_ssr_df.append(df_i)
should be:
sum_ssr_df = sum_ssr_df.append(df_i)
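As a side note, a sketch of the more efficient pattern (which also sidesteps DataFrame.append, deprecated and later removed in pandas 2.x) is to collect the per-iteration results in a plain list and build the DataFrame once at the end:
rows = []
for i in power_list:
    phi = variance / (mean**i)
    tvs = tweedie.tweedie(mu=mean, p=i, phi=phi).rvs(len(x))
    residual = x - np.sort(tvs)                       # same residual definition as above
    rows.append({'power': i, 'dispersion': phi, 'ssr': np.sum(residual**2)})
sum_ssr_df = pd.DataFrame(rows)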
I'm looking for a way to cluster a set of features with the DBSCAN algorithm in TensorFlow, but I'm unable to find anything related.
TensorFlow offers K-Means clustering (tf.contrib.learn.KMeansClustering), but I need the DBSCAN algorithm.
Can anybody suggest any existing wrappers written in Python/Java?
Any pointers on how to implement it from scratch?
P.S. I am aware of sklearn and similar libraries that have DBSCAN, but I specifically need it in TensorFlow.
I know I'm like a year late to the party, but for any future reference:
here is my implementation of a DBSCAN-like algorithm. It might give results slightly different from what you'd get from the algorithm implemented in sklearn, for example, especially for observations that may belong to more than one cluster.
I know it's probably not optimal.
I know, that TF is not the best choice when it comes to implementing the algorithm.
But maybe someone will find the code valuable.
Relevant code:
import tensorflow as tf
import numpy as np

def run(vals, epsilon=4, min_points=4):
    def merge_core_points_into_clusters(elems):
        row = elems
        mat = core_points_connection_matrix
        nonempty_intersection_inds = tf.where(tf.reduce_any(tf.logical_and(row, mat), axis=1))
        cumul = tf.logical_or(row, mat)
        subcumul = tf.gather_nd(cumul, nonempty_intersection_inds)
        return tf.reduce_any(subcumul, axis=0)

    def label_clusters(elems):
        return tf.reduce_min(tf.where(elems))

    def get_subsets_for_labels(elems):
        val = elems[0]
        labels = elems[1]
        conn = relation_matrix
        inds = tf.where(tf.equal(labels, val))
        masks = tf.gather_nd(conn, inds)
        return tf.reduce_any(masks, axis=0)

    def scatter_labels(elems):
        label = tf.expand_dims(elems[0], 0)
        mask = elems[1]
        return label*tf.cast(mask, dtype=tf.int64)

    data_np = np.array(vals)
    eps = epsilon
    min_pts = min_points

    in_set = tf.placeholder(tf.float64)

    # distance matrix
    r = tf.reduce_sum(in_set*in_set, 1)
    # turn r into column vector
    r = tf.reshape(r, [-1, 1])
    dist_mat = tf.sqrt(r - 2*tf.matmul(in_set, tf.transpose(in_set)) + tf.transpose(r))

    # for every point show, which points are within eps distance of that point (including that point)
    relation_matrix = dist_mat <= eps

    # number of points within eps-ball for each point
    num_neighbors = tf.reduce_sum(tf.cast(relation_matrix, tf.int64), axis=1)

    # for each point show, whether this point is core point
    core_points_mask = num_neighbors >= min_pts

    # indices of core points
    core_points_indices = tf.where(core_points_mask)

    core_points_connection_matrix = tf.cast(core_points_mask, dtype=tf.int64) * tf.cast(relation_matrix, dtype=tf.int64)
    core_points_connection_matrix = tf.cast(core_points_connection_matrix, dtype=tf.bool)
    core_points_connection_matrix = tf.logical_and(core_points_connection_matrix, core_points_mask)

    merged = tf.map_fn(
        merge_core_points_into_clusters,
        core_points_connection_matrix,
        dtype=tf.bool
    )

    nonempty_clusters_records = tf.gather_nd(merged, core_points_indices)

    marked_core_points = tf.map_fn(label_clusters, nonempty_clusters_records, dtype=tf.int64)

    _, labels_core_points = tf.unique(marked_core_points, out_idx=tf.int64)

    labels_core_points = labels_core_points + 1

    unique_labels, _ = tf.unique(labels_core_points)

    labels_all = tf.scatter_nd(
        tf.cast(core_points_indices, tf.int64),
        labels_core_points,
        shape=tf.cast(tf.shape(core_points_mask), tf.int64)
    )

    # for each label return mask, which points should have this label
    ul_shape = tf.shape(unique_labels)
    labels_tiled = tf.maximum(tf.zeros([ul_shape[0], 1], dtype=tf.int64), labels_all)

    labels_subsets = tf.map_fn(
        get_subsets_for_labels,
        (unique_labels, labels_tiled),
        dtype=tf.bool
    )

    final_labels = tf.map_fn(
        scatter_labels,
        elems=(tf.expand_dims(unique_labels, 1), labels_subsets),
        dtype=tf.int64
    )

    final_labels = tf.reduce_max(final_labels, axis=0)

    with tf.Session() as sess:
        results = (sess.run(final_labels, feed_dict={in_set: data_np})).reshape((1, -1))

    results = results.reshape((-1, 1))

    return results
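For completeness, a minimal usage sketch (assuming a TF 1.x environment, since the code relies on tf.placeholder and tf.Session):
points = np.array([[0.0, 0.0], [0.5, 0.0], [0.0, 0.5],
                   [10.0, 10.0], [10.5, 10.0], [10.0, 10.5],
                   [50.0, 50.0]])
labels = run(points, epsilon=1.0, min_points=3)
print(labels.ravel())   # the two tight groups get positive cluster labels, the far point stays 0 (noise)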
I need to randomly subsample a vector in pytorch.
The equivalent in Matlab would be something like
sample_size = 5
a = rand(10,1)
idx = randperm(10)
b = a(idx(1:sample_size))
Are there similar functions for pytorch?
I'm trying to randomly subsample my prediction and target vector for computing the loss.
I think I already found something useful:
sample_size = 5
a = torch.randn(10)
b = torch.randperm(10)
a = a.index_select(0,b)
a = a[0:sample_size]
Or you could simply do:
sample_size = 5
a = torch.randn(10)
b = torch.randperm(10)
a_sample = a[b[0:sample_size]]
That samples without replacement, as in your question.
Or if you want to sample with replacement:
sample_size = 5
a = torch.randn(10)
b = torch.randint(0, 10, size=(sample_size,))
a_sample = a[b]
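If the goal is to subsample a prediction and a target tensor consistently (as mentioned in the question), the same random index tensor can be applied to both. A small sketch, with hypothetical pred/target names:
import torch
import torch.nn.functional as F

sample_size = 5
pred = torch.randn(10)
target = torch.randn(10)
idx = torch.randperm(pred.size(0))[:sample_size]   # shared indices, without replacement
loss = F.mse_loss(pred[idx], target[idx])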