Remove a row from numpy array? - Python

Hi all, what I want should be really simple for somebody here. I want to remove a row from a numpy array in a loop, like:
for i in range(len(self.Finalweight)):
    if self.Finalweight[i] >= self.cutoffOutliers:
        # remove row i from self.wData
I'm trying to remove outliers from a dataset. The full code of the method is:
def calculate_Outliers(self):
    def calcWeight(Value):
        pFinal = abs(Value - self.pMed) / self.pDev_abs_Med
        gradFinal = abs(gradient(Value) - self.gradMed) / self.gradDev_abs_Med
        return pFinal * gradFinal
    self.pMed = median(self.wData[:, self.yColum-1])
    self.pDev_abs_Med = median(abs(self.wData[:, self.yColum-1] - self.pMed))
    self.gradMed = median(gradient(self.wData[:, self.yColum-1]))
    self.gradDev_abs_Med = median(abs(gradient(self.wData[:, self.yColum-1]) - self.gradMed))
    self.workingData = self.wData[calcWeight(self.wData) < self.cutoffOutliers]
    self.xData = self.workingData[:, self.xColum-1]
    self.yData = self.workingData[:, self.yColum-1]
I'm getting the following error:
ile "bin/dmtools", line 201, in plot_gride
self.calculate_Outliers()
File "bin/dmtools", line 188, in calculate_Outliers
self.workingData= self.wData[calcWeight(self.wData)>self.cutoffOutliers]
ValueError: too many indices for array

There is actually a tool in NumPy specifically made to mask out outliers and invalid data points: masked arrays. Example from the linked page:
x = numpy.array([1, 2, 3, -1, 5])
mx = numpy.ma.masked_array(x, mask=[0, 0, 0, 1, 0])
print(mx.mean())
prints
2.75
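If the goal is to drop the offending rows outright rather than mask them, a 1D boolean index over the rows does it without any loop. A minimal sketch, with hypothetical stand-ins for the question's Finalweight and wData:
import numpy as np

# Hypothetical stand-ins for self.Finalweight and self.wData:
Finalweight = np.array([0.2, 5.0, 0.7, 9.1])  # one weight per row
wData = np.arange(12.0).reshape(4, 3)
cutoffOutliers = 1.0

keep = Finalweight < cutoffOutliers  # 1D boolean mask, one entry per row
wData_clean = wData[keep]            # keeps only the rows below the cutoff
print(wData_clean)                   # rows 0 and 2 survive
The "too many indices" error in the question likely means the boolean mask ended up with more dimensions than wData (gradient of a 2D array returns one gradient per axis); computing the weights from a single column, e.g. calcWeight(self.wData[:, self.yColum-1]), should give one boolean per row.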

Related

Replace outlier values with NaN in numpy? (preserve length of array)

I have an array of magnetometer data with artifacts every two hours due to power cycling.
I'd like to replace those indices with NaN so that the length of the array is preserved.
Here's a code example, adapted from https://www.kdnuggets.com/2017/02/removing-outliers-standard-deviation-python.html.
import numpy as np
import plotly.express as px
# For pulling data from CDAweb:
from ai import cdas
import datetime
# Import data:
start = datetime.datetime(2016, 1, 24, 0, 0, 0)
end = datetime.datetime(2016, 1, 25, 0, 0, 0)
data = cdas.get_data(
    'sp_phys',
    'THG_L2_MAG_' + 'PG2',
    start,
    end,
    ['thg_mag_' + 'pg2']
)
x = data['UT']
y = data['VERTICAL_DOWN_-_Z']
def reject_outliers(y):  # y is the data in a 1D numpy array
    n = 5  # 5 std deviations
    mean = np.mean(y)
    sd = np.std(y)
    final_list = [x for x in y if (x > mean - 2 * sd)]
    final_list = [x for x in final_list if (x < mean + 2 * sd)]
    return final_list
px.scatter(reject_outliers(y))
print('Length of y: ')
print(len(y))
print('Length of y with outliers removed (should be the same): ')
print(len(reject_outliers(y)))
px.line(y=y, x=x)
# px.scatter(y) # It looks like the outliers are successfully dropped.
# px.line(y=reject_outliers(y), x=x) # This is the line I'd like to see work.
When I run px.scatter(reject_outliers(y)), it looks like the outliers are successfully getting dropped, but that plot shows the culled y vector against its index rather than against the datetime vector x as in the plot above. As the debugging output indicates, the vector is shortened because the outlier values are dropped rather than replaced.
How can I edit my reject_outliers() function to assign those values to NaN, or to adjacent values, in order to keep the length of the array the same so that I can plot my data?
Use else in the list comprehension along the lines of:
[x if x_condition else other_value for x in y]
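Applied to this case, a minimal sketch of that comprehension (keeping the mean, sd, and n definitions from the question's function):
import numpy as np

def reject_outliers(y, n=5):
    # Replace values more than n standard deviations from the mean with NaN.
    mean, sd = np.mean(y), np.std(y)
    return np.array([v if abs(v - mean) <= n * sd else np.nan for v in y])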
Got a less compact version to work. Full code:
import numpy as np
import plotly.express as px
# For pulling data from CDAweb:
from ai import cdas
import datetime
# Import data:
start = datetime.datetime(2016, 1, 24, 0, 0, 0)
end = datetime.datetime(2016, 1, 25, 0, 0, 0)
data = cdas.get_data(
    'sp_phys',
    'THG_L2_MAG_' + 'PG2',
    start,
    end,
    ['thg_mag_' + 'pg2']
)
x = data['UT']
y = data['VERTICAL_DOWN_-_Z']
def reject_outliers(y):  # y is the data in a 1D numpy array
    mean = np.mean(y)
    sd = np.std(y)
    final_list = np.copy(y)
    for n in range(len(y)):
        final_list[n] = y[n] if y[n] > mean - 5 * sd else np.nan
        final_list[n] = final_list[n] if final_list[n] < mean + 5 * sd else np.nan
    return final_list
px.scatter(reject_outliers(y))
print('Length of y: ')
print(len(y))
print('Length of y with outliers removed (should be the same): ')
print(len(reject_outliers(y)))
# px.line(y=y, x=x)
px.line(y=reject_outliers(y), x=x)  # This is the line I wanted to get working - check!
More compact answer, sent via email by a friend:
In numpy you can select/index based on a Boolean array, and then make assignment with it:
def reject_outliers(y):  # y is the data in a 1D numpy array
    n = 5  # 5 std deviations
    mean = np.mean(y)
    sd = np.std(y)
    final_list = y.copy()
    final_list[np.abs(y - mean) > n * sd] = np.nan
    return final_list
I also noticed that you didn’t use the value of n in your example code.
Alternatively, you can use np.where (https://numpy.org/doc/stable/reference/generated/numpy.where.html):
np.where(np.abs(y - mean) > n * sd, np.nan, y)
You don’t need the .copy() if you don’t mind modifying the input array.
Replace np.mean and np.std with np.nanmean and np.nanstd if you want the function to work on arrays that already contain nans, i.e. if you want to use this function recursively.
The answer about using if else in a list comprehension would work, but avoiding the list comprehension makes the function much faster if the arrays are large.
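For completeness, a NaN-tolerant variant along those lines is sketched below; np.nanmean and np.nanstd simply ignore NaNs already present, so the function can be applied repeatedly:
import numpy as np

def reject_outliers_nan_safe(y, n=5):
    mean, sd = np.nanmean(y), np.nanstd(y)  # ignore existing NaNs
    return np.where(np.abs(y - mean) > n * sd, np.nan, y)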

How to iterate through the non-zero values of an image? - Python

I have found online a function to extract and display the dominant colors of an image. To save time, I want to iterate only over the non-zero pixels instead of the whole image. However, the way I changed the function raises an error:
if row != [0,0,0]:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Here is the modified code:
def dominantColor(image):
    from matplotlib.pyplot import imshow, show
    from scipy.cluster.vq import whiten
    from scipy.cluster.vq import kmeans
    import pandas as pd
    r = []
    g = []
    b = []
    for row in image:
        if row != [0, 0, 0]:  # the part I added to the original code
            print(row)
            for temp_r, temp_g, temp_b in row:
                r.append(temp_r)
                g.append(temp_g)
                b.append(temp_b)
    image_df = pd.DataFrame({'red': r, 'green': g, 'blue': b})
    image_df['scaled_color_red'] = whiten(image_df['red'])
    image_df['scaled_color_blue'] = whiten(image_df['blue'])
    image_df['scaled_color_green'] = whiten(image_df['green'])
    cluster_centers, _ = kmeans(image_df[['scaled_color_red', 'scaled_color_blue', 'scaled_color_green']], 3)
    dominant_colors = []
    red_std, green_std, blue_std = image_df[['red', 'green', 'blue']].std()
    for cluster_center in cluster_centers:
        red_scaled, green_scaled, blue_scaled = cluster_center
        dominant_colors.append((
            red_scaled * red_std / 255,
            green_scaled * green_std / 255,
            blue_scaled * blue_std / 255
        ))
    imshow([dominant_colors])
    show()
    return dominant_colors
How should I correct my iteration loop to remove the error and process only the non-zero values of my image? (NB: the image is actually mask * original_image.)
You need to add the .all() method to that comparison if you want to compare arrays element-wise, i.e. (row == [0, 0, 0]).all().
import numpy as np

image = np.array([
    [0, 0, 0],
    [1, 0, 0],
    [0, 0, 1],
])

for row in image:
    if not (row == [0, 0, 0]).all():
        print(row)
Result:
[1 0 0]
[0 0 1]
If I understand your code correctly, the answer is in the error log that you posted:
if row != [0,0,0]:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
So, the any function checks whether any of the elements in the row evaluate as true:
for row in image:
    if any(row):  # enter the if block if any element is not 0
        print(row)
        for temp_r, temp_g, temp_b in row:
            r.append(temp_r)
            g.append(temp_g)
            b.append(temp_b)
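If speed is the motivation, it may be worth skipping the Python loop entirely and selecting the non-zero pixels with boolean indexing. A sketch, assuming image is an (H, W, 3) RGB array as the question implies (the random image here is a hypothetical stand-in for mask * original_image):
import numpy as np

# Hypothetical masked RGB image: random colors, with some pixels zeroed out.
image = np.random.randint(0, 256, size=(4, 4, 3)) * np.random.randint(0, 2, size=(4, 4, 1))

pixels = image.reshape(-1, 3)                  # one pixel per row
nonzero = pixels[np.any(pixels != 0, axis=1)]  # keep pixels with any non-zero channel
r, g, b = nonzero[:, 0], nonzero[:, 1], nonzero[:, 2]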

Audio Data Augmentation in Python

I am using the function below to augment audio data generated from WAV audio files.
def generate_augmented_data(file_path):
    augmented_data = []
    samples = load_wav(file_path, get_duration=False)
    for time_value in [0.7, 1, 1.3]:
        for pitch_value in [-1, 0, 1]:
            time_stretch_data = librosa.effects.time_stretch(samples, rate=time_value)
            final_data = librosa.effects.pitch_shift(time_stretch_data, sr=sample_rate, n_steps=pitch_value)
            augmented_data.append(final_data)
    return augmented_data
I also need to augment the class labels and am facing difficulties with it. I tried the code below, but it's not getting me the expected result:
## generating augmented data.
def generate_augmented_data_label(file_path, label):
    augmented_data = []
    augmented_label = []
    samples = load_wav(file_path, get_duration=False)
    for time_value in [0.7, 1, 1.3]:
        for pitch_value in [-1, 0, 1]:
            time_stretch_data = librosa.effects.time_stretch(samples, rate=time_value)
            final_data = librosa.effects.pitch_shift(time_stretch_data, sr=sample_rate, n_steps=pitch_value)
            augmented_data.append(final_data)
            augmented_label.append(label)
    return augmented_data, augmented_label
Before augmentation, the data and label lengths are as below:
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
X_train_augmented_data = []
y_train_augmented_data = []
for i in range(len(X_train)):
    #print(i)
    t1 = X_train.iloc[i]
    t2 = y_train[i]
    tmp1, tmp2 = generate_augmented_data_label(t1, t2)
    #print(tmp1, tmp2)
    X_train_augmented_data.append(tmp1)
    y_train_augmented_data.append(tmp2)
len(X_train)
1600
len(y_train)
1600
print(len(X_train_augmented_data))
print(len(y_train_augmented_data))
After data augmentation and an additional masking step, the shape comes out as:
augmented_train_data_mask = []
for i in range(0, len(augmented_train_data_pad)):
    augmented_train_data_mask.append(list(map(bool, augmented_train_data_pad[i])))
augmented_train_data_mask = np.array(augmented_train_data_mask)
print(augmented_train_data_pad.shape)
print(augmented_train_data_mask.shape)
(14400, 17640)
(14400, 17640)
However, the label length is still 1600. Later, when I pass these into an LSTM model, I get a shape mismatch error:
ValueError: Data cardinality is ambiguous:
x sizes: 14400, 14400
y sizes: 1600
Make sure all arrays contain the same number of samples.
Looking for some help to resolve this issue.
You can use NumPy's repeat function to replicate your array. For example:
In:  arr = np.arange(3); arr
Out: array([0, 1, 2])
In:  arr.repeat(3)
Out: array([0, 0, 0, 1, 1, 1, 2, 2, 2])
Hope this satisfies your requirement.
You may refer to this link for reference:
https://www.geeksforgeeks.org/python-add-similar-value-multiple-times-in-list/
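Applied to this question: each file yields 3 time values x 3 pitch values = 9 augmented clips, so the 1600 labels need to be repeated 9 times to line up with the 14400 samples. A sketch, with a hypothetical Series standing in for the question's y_train:
import numpy as np
import pandas as pd

# Hypothetical labels standing in for the question's y_train Series:
y_train = pd.Series(np.random.randint(0, 5, size=1600))

# 3 time stretches x 3 pitch shifts = 9 augmented clips per sample.
y_train_augmented = np.asarray(y_train).repeat(9)
print(y_train_augmented.shape)  # (14400,) to match the augmented x data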
Since y_train is a pandas Series, the same replication can also be done with itertools:
from itertools import repeat

new_label = []
for index, value in y_train.items():
    new_label.extend(repeat(value, 9))  # one copy per augmented clip (3 x 3 = 9)
len(new_label)

ValueError: x and y must have same first dimension, but have shapes (41,) and (1, 41)

I'm trying to plot a vs kappa_inv and I keep getting the error: ValueError: x and y must have same first dimension, but have shapes (41,) and (1, 41). I saw a previous post about changing the square brackets in plt.plot to round ones, but the error still occurs. Can anyone see what I'm doing wrong?
import numpy
L = numpy.array([20] * 41)
delta = numpy.array([0.5] * 41)
x = L / delta
a = [0.5 * i - 0.5 for i in range(41)]  # -0.5, 0, 0.5, ..., 19.5
numpy.array(a)
# Force
F = 100  # kN
# calc sigma
y = 250  # mm
E = 32800  # MPa
I = 1.837E9  # mm4
sig = y / (E * I)
print(sig)
kappa = []
b = []
y = 20
while y >= 0:
    b.append(y)
    y = y - 0.5
numpy.array(b)
for val in a:
    val = "{:.1f}".format(val)
    val = float(val)
    fraction = b / L
    kappa_i = fraction * val
    kappa.append(kappa_i)
    b = b - delta
N = 4
Length = len(kappa)
pad_kappa = numpy.pad(kappa, (0, N), 'constant', constant_values=0)
print(pad_kappa)
# Calc bending moment list
BM = []
for k in range(0, Length):
    bendingMoment = (pad_kappa[k] * F) + (pad_kappa[k+3] * F)
    BM.append(bendingMoment)
print(BM)
Strain = []
for j in range(0, len(BM)):
    strain = (BM[j] * sig) * 10E6
    Strain.append(strain)
kappa_inv = [-x for x in kappa]
numpy.array(kappa_inv)
import matplotlib.pyplot as plt
plt.plot(a, kappa_inv)
plt.ylabel('KAPPA')
plt.xlabel('LENGTH ALONG BEAM')
plt.show()
#E = BM*10E6 * sigma
strainCalcReverse = []
for s in Strain:
    bendYourMomLOL = s / sig * (1 / 10E6)
    strainCalcReverse.append(bendYourMomLOL)  # append to the list, not the float
print(strainCalcReverse)
There's a lot of messy stuff in your code.
Lines like:
numpy.array(a)  # doesn't change list a
numpy.array(b)  # same
but
fraction = b / L  # only works once b is an array, not a list
The loop below looks like an attempt to turn all the elements of a into floats, but that's not how a Python loop works:
for val in a:
    val = "{:.1f}".format(val)
    val = float(val)
a = np.array(a) will produce a float array, so there's no need for this loop.
Anyway, it looks like kappa_i is an array. If so, the following demonstrates your error:
In [311]: kappa = []
In [312]: kappa.append(np.arange(3))
In [313]: kappa
Out[313]: [array([0, 1, 2])]
In [314]: plt.plot([1,2,3], kappa)
Traceback (most recent call last):
  File "<ipython-input-314-e15645b5613f>", line 1, in <module>
    plt.plot([1,2,3], kappa)
  File "/usr/local/lib/python3.8/dist-packages/matplotlib/pyplot.py", line 2988, in plot
    return gca().plot(
  File "/usr/local/lib/python3.8/dist-packages/matplotlib/axes/_axes.py", line 1605, in plot
    lines = [*self._get_lines(*args, data=data, **kwargs)]
  File "/usr/local/lib/python3.8/dist-packages/matplotlib/axes/_base.py", line 315, in __call__
    yield from self._plot_args(this, kwargs)
  File "/usr/local/lib/python3.8/dist-packages/matplotlib/axes/_base.py", line 501, in _plot_args
    raise ValueError(f"x and y must have same first dimension, but "
ValueError: x and y must have same first dimension, but have shapes (3,) and (1, 3)
By using that list append, you made a list whose single element is an array; when passed to plot, that becomes a (1, n) array. Correct your code, whether that means the actual code or the copy in the question, and pay closer attention to which variables are lists and which are arrays, and, for the arrays, to their shape and dtype.
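One way out, then, is to collapse that extra dimension before plotting. A minimal sketch with hypothetical data shaped like the failing case:
import numpy as np
import matplotlib.pyplot as plt

a = np.arange(3)                       # hypothetical x data
kappa_inv = [np.array([5, 4, 3])]      # a list holding one array -> shape (1, 3)

y = np.squeeze(np.asarray(kappa_inv))  # collapse the extra dimension -> (3,)
plt.plot(a, y)
plt.show()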

How to create a 2D array with N lots of random numbers?

I am trying to obtain a variance for a value I obtained by processing a 2x150 array into a discrete correlation function. In order to do this I need to randomly sample 80% of the original data N times, which will allow me to calculate a variance over these values.
I have so far been able to create one randomly sampled set of data using this:
rand_indices = []
running_var = (len(find_length) * 0.8)
x = 0
while x < running_var:
    rand_inx = randint(0, (len(find_length) - 1))
    rand_indices.append(rand_inx)
    x = x + 1
which creates an array 80% of the length of my original, with randomly selected indices to be picked out and processed.
My problem is that I am not sure how to iterate this in order to get N sets of these random numbers, ideally in an N x 120 array. My whole code so far is:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from random import randint

useless, just_to, find_length = np.loadtxt("w2_mjy_final.dat").T
w2_dat = np.loadtxt("w2_mjy_final.dat")
w2_rel = np.delete(w2_dat, 2, axis=1)
w2_array = np.asarray(w2_rel)
w1_dat = np.loadtxt("w1_mjy_final.dat")
w1_rel = np.delete(w1_dat, 2, axis=1)
w1_array = np.asarray(w1_rel)
peaks = []
y = 1
N = 0
x = 0
z = 0
rand_indices = []
rand_indices2d = []
running_var = (len(find_length) * 0.8)
while z < N:
    while x < running_var:
        rand_inx = randint(0, (len(find_length) - 1))
        rand_indices.append(rand_inx)
        x = x + 1
    rand_indices2d.append(rand_indices)
    z = z + 1
while y < N:
    w1_sampled = w1_array[rand_indices, :]
    w2_sampled = w2_array[rand_indices, :]
    w1s_t, w1s_dat = zip(*w1_sampled)
    w2s_t, w2s_dat = zip(*w2_sampled)
    w2s_mean = np.mean(w2s_dat)
    w2s_stdev = np.std(w2s_dat)
    w1s_mean = np.mean(w1s_dat)
    w1s_stdev = np.std(w1s_dat)
    taus = []
    dcfs = []
    bins = 40
    for i in w2s_t:
        for j in w1s_t:
            tau_datpoint = i - j
            taus.append(tau_datpoint)
    for k in w2s_dat:
        for l in w1s_dat:
            dcf_datpoint = ((k - w2s_mean) * (l - w1s_mean)) / ((w2s_stdev * w1s_stdev))
            dcfs.append(dcf_datpoint)
    plotdat = np.vstack((taus, dcfs)).T
    sort_plotdat = sorted(plotdat, key=lambda x: x[0])
    np.savetxt("w1sw2sarray.txt", sort_plotdat)
    taus_sort, dcfs_sort = np.loadtxt("w1w2array.txt").T
    dcfs_means, taubins_edges, taubins_number = stats.binned_statistic(taus_sort, dcfs_sort, statistic='mean', bins=bins)
    taubin_edge = np.delete(taubins_edges, 0)
    import operator
    indexs, values = max(enumerate(dcfs_means), key=operator.itemgetter(1))
    percents = values * 0.8
    dcf_lists = dcfs_means.tolist()
    centarr_negs, centarr_poss = np.split(dcfs_means, [indexs])
    centind_negs = np.argmin(np.abs(centarr_negs - percents))
    centind_poss = np.argmin(np.abs(centarr_poss - percents))
    lagcent_negs = taubins_edges[centind_negs]
    lagcent_poss = taubins_edges[int((bins/2) + centind_poss)]
    sampled_peak = (np.abs(lagcent_poss - lagcent_negs) / 2) + lagcent_negs
    peaks.append(sampled_peak)
    y = y + 1
print(peaks)
Seeing as you're using numpy already, why not use np.random.randint?
In your case:
np.random.randint(len(find_length)-1, size=(N, running_var))
would give you an N x running_var matrix with random integer entries from 0 to len(find_length)-2 inclusive.
Example Usage:
>>> N = 4
>>> running_var = 6
>>> find_length = [1, 2, 3]
>>> np.random.randint(len(find_length)-1, size=(N, running_var))
array([[1, 0, 1, 0, 0, 1],
       [1, 0, 1, 1, 0, 0],
       [1, 1, 0, 0, 1, 0],
       [1, 1, 0, 1, 0, 1]])
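One detail worth noting: the standard-library randint(0, len(find_length)-1) in the question includes the upper endpoint, while np.random.randint excludes it, so passing len(find_length) as the bound reproduces the original range; size must also be given integers. A sketch of how the result plugs into the resampling loop, with hypothetical stand-ins for the question's arrays:
import numpy as np

# Hypothetical stand-ins for the question's data:
find_length = np.arange(150)
w1_array = np.random.rand(150, 2)
w2_array = np.random.rand(150, 2)
N = 10

n_keep = int(len(find_length) * 0.8)                         # 80% sample size
idx = np.random.randint(len(find_length), size=(N, n_keep))  # N rows of indices
for z in range(N):
    w1_sampled = w1_array[idx[z], :]  # one resampled 80% subset per iteration
    w2_sampled = w2_array[idx[z], :]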
