I am using scikit-learn's ExtraTreesClassifier:
import time
import pandas as pd
import datatable as dt
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
def __init__(self):
    self.ExTrCl = ExtraTreesClassifier()
The prediction gives different execution times with a pandas DataFrame vs. a datatable Frame vs. a NumPy array!
At the beginning, I generate a 2D NumPy array from the test dataset with generateTestPart(testDataSet, list_motifs_used_in_train).
I use three methods to make the prediction: 1) using a pandas DataFrame:
def test_groupe_score_pd(self, test_matrix):
    start_time_0 = time.time()
    dftest = pd.DataFrame(test_matrix, columns=self.list_motifs)
    end_time = time.time()
    print(" time creating DataFrame = ", end_time - start_time_0)
    start_time = time.time()
    result = self.ExTrCl.predict(dftest)
    end_time = time.time()
    print(" Time pred only = ", end_time - start_time, " s")
    print(" Time create + pred = ", end_time - start_time_0, " s")
2) using a datatable Frame:
def test_groupe_score_dt(self, test_matrix):
    start_0_time = time.time()
    dt_dftest = dt.Frame(np.array(test_matrix), names=self.list_motifs)
    end_time = time.time()
    print(" time creating dt Frame = ", end_time - start_0_time)
    start_time = time.time()
    result = self.ExTrCl.predict(dt_dftest)
    end_time = time.time()
    print(" Time pred only = ", end_time - start_time, " s")
    print(" Time pred + create = ", end_time - start_0_time, " s")
3) using NumPy directly:
def test_groupe_score_numpy(self, test_matrix):
    start_0_time = time.time()
    start_time = time.time()
    result = self.ExTrCl.predict(test_matrix)
    end_time = time.time()
    print(" Time pred only = ", end_time - start_time, " s")
The results, using a matrix larger than 500 MB, are shown in the following table:
It is very clear that for the prediction alone, pandas gives the best results, but we also have to account for the time needed to create the DataFrame; hence, using NumPy directly seems to be the best choice.
The question is: why is there such a difference in prediction time between pandas, datatable, and NumPy?
Thank you for your help.
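For reference, a minimal self-contained sketch of the comparison is below. It is hypothetical: it uses a small synthetic matrix and a freshly fitted classifier instead of my real > 500 MB test matrix, and it assumes predict() accepts the datatable Frame directly, as it did in the timings above.
import time
import numpy as np
import pandas as pd
import datatable as dt
from sklearn.ensemble import ExtraTreesClassifier

# Synthetic stand-in for the real test data (assumed shape: 20000 x 200).
X = np.random.rand(20000, 200)
y = np.random.randint(0, 2, size=20000)
cols = ["m{}".format(i) for i in range(X.shape[1])]

clf = ExtraTreesClassifier(n_estimators=100)
clf.fit(X, y)

for name, build in [("numpy", lambda: X),
                    ("pandas", lambda: pd.DataFrame(X, columns=cols)),
                    ("datatable", lambda: dt.Frame(X, names=cols))]:
    t0 = time.time()
    test_input = build()        # time spent building the container
    t1 = time.time()
    clf.predict(test_input)     # time spent in prediction only
    t2 = time.time()
    print(name, "create:", t1 - t0, "s, predict:", t2 - t1, "s, total:", t2 - t0, "s")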
I'm trying to create an efficient function for re-sampling time-series data.
Assumption: Both sets of time-series data have the same start and end time. (I do this in a separate step.)
Resample function (inefficient)
import numpy as np
def resample(desired_time_sequence, data_sequence):
    downsampling_indices = np.linspace(0, len(data_sequence)-1, len(desired_time_sequence)).round().astype(int)
    downsampled_array = [data_sequence[ind] for ind in downsampling_indices]
    return downsampled_array
Speed testing
import timeit
def test_speed(): resample([1,2,3], [.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6])
print(timeit.timeit(test_speed, number=100000))
# 1.5003695999998854
Interested to hear any suggestions.
Replacing
downsampled_array = [data_sequence[ind] for ind in downsampling_indices]
with
downsampled_array = data_sequence[downsampling_indices]
provided a ~7x speedup on my test data.
Code used to measure the speedup:
import timeit
f1 = """
def resample(output_len, data_sequence):
    downsampling_indices = np.linspace(0, len(data_sequence)-1, output_len).round().astype(int)
    downsampled_array = [data_sequence[ind] for ind in downsampling_indices]
    return downsampled_array

resample(output_len, data_sequence)
"""
f2 = """
def resample_fast(output_len, data_sequence):
    downsampling_indices = np.linspace(0, len(data_sequence)-1, output_len).round().astype(int)
    downsampled_array = data_sequence[downsampling_indices]
    return downsampled_array

resample_fast(output_len, data_sequence)
"""
setup="""
import numpy as np
data_sequence = np.random.randn(10000)
output_len = 752
"""
print(timeit.timeit(f1, setup, number=1000))
print(timeit.timeit(f2, setup, number=1000))
# prints:
# 0.30194038699846715
# 0.041797632933594286
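A quick sanity check (a sketch using the same setup values as above) confirms that the fancy-indexing version returns the same values as the list-comprehension version, just as a NumPy array instead of a Python list:
import numpy as np

data_sequence = np.random.randn(10000)
output_len = 752

idx = np.linspace(0, len(data_sequence) - 1, output_len).round().astype(int)
slow = [data_sequence[ind] for ind in idx]   # Python list built element by element
fast = data_sequence[idx]                    # single vectorized fancy-indexing call

print(np.array_equal(np.array(slow), fast))  # expected: True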
I am looking for a GPU-accelerated n-dimensional sliding-window operation implemented in Python using TensorFlow. You can post your implementation in Torch, Caffe, or Theano, but I'll choose the TensorFlow implementation as the accepted answer. Please post a working code snippet that performs a 2D median filter (and that can, hopefully, be applied to n-dimensional images with no or minimal code changes).
With my limited knowledge of TensorFlow, I believe the two potential modules to start with are sliding_window_batch or extract_image_patches, followed by some map/apply/reshape magic?
My failed attempt is posted below, for entertainment. Please note that I posted a similar question two years ago asking for a Theano implementation; nowadays most people are using tf/keras or torch.
import time
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from tensorflow.contrib.data.python.ops import sliding
from skimage import img_as_float, data
from scipy.signal import medfilt
imgs = img_as_float(data.camera())
### SCIPY median ###
stime = time.time()
scipysmoothed = medfilt(imgs,(9,9))
etime = time.time()
print('scipy smoothed: {:1.4f} seconds'.format(etime-stime))
### Failed attempt of TF median ###
method = 'Tensorflow'
stime = time.time()
window_func = lambda x: tf.contrib.distributions.percentile(x, 50.0)
# create TensorFlow Dataset object
data = tf.data.Dataset.from_tensor_slices(imgs)
# sliding window - only 1d is allowed?
window = 3
stride = 1
data = data.apply(sliding.sliding_window_batch(window, stride)).map(lambda x: window_func(x))
# create TensorFlow Iterator object
iterator = tf.data.Iterator.from_structure(data.output_types)
next_element = iterator.get_next()
# create initialization ops
init_op = iterator.make_initializer(data)
c=0
smoothed = np.zeros(imgs.shape)
with tf.Session() as sess:
    # initialize the iterator on the data
    sess.run(init_op)
    while True:
        try:
            elem = sess.run(next_element)
            smoothed[c,:] = elem
            # obviously WRONG.
            c += 1
        except tf.errors.OutOfRangeError:
            #print("End of dataset.")
            break
#print(c)
etime = time.time()
print('tf smoothed: {:1.4f} seconds'.format(etime-stime))
plt.figure(figsize=(20,20))
plt.subplot(131)
plt.imshow(imgs,cmap='gray',interpolation='none')
plt.title('original')
plt.subplot(132)
plt.imshow(smoothed,cmap='gray',interpolation='none')
plt.title('actual smoothed\nwith {}'.format(method))
plt.subplot(133)
plt.imshow(scipysmoothed,cmap='gray',interpolation='none')
_=plt.title('expected smoothed')
Output:
scipy smoothed: 1.1899 seconds
tf smoothed: 0.7485 seconds
Proposal 1: my attempt is below. Since it uses only tf.image.extract_image_patches and tf.extract_volume_patches, the implementation supports just 2D and 3D images.
Proposal 2: one could format the data as a preprocessing step (via tf.data.Dataset.map); however, this also takes a lot of time, and I am not sure why yet (example: https://gist.github.com/pangyuteng/ca5cb07fe383ebe59b521c832f2e2918 ).
Proposal 3: use convolutional blocks to parallelize processing; see "Hypercolumns for Object Segmentation and Fine-grained Localization", https://arxiv.org/abs/1411.5752 .
--
Proposal 1 code:
import time
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import tensorflow as tf
from tensorflow.contrib.data.python.ops import sliding
from skimage import img_as_float, data
from scipy.signal import medfilt
dtype = 2
if dtype == 2:
    imgs = img_as_float(data.camera())
elif dtype == 3:
    imgs = np.random.rand(28,28,28)
imgs = img_as_float(data.camera())
### SCIPY median ###
stime = time.time()
scipysmoothed = medfilt(imgs,(9,9))
etime = time.time()
print('scipy smoothed: {:1.4f} seconds'.format(etime-stime))
### TF median ###
method = 'Tensorflow'
imgs = np.expand_dims(imgs,axis=-1)
imgs = np.expand_dims(imgs,axis=0)
print('imgs.shape:{}'.format(imgs.shape))
imgs = tf.cast(imgs,tf.float32)
stime = time.time()
if len(imgs.shape) == 4:
    kernel = (1,9,9,1)
    stride = (1,1,1,1)
    rates = (1,1,1,1)
    padding = 'SAME'
    patches = tf.image.extract_image_patches(
        imgs, kernel, stride, rates, padding,
    )
    _, x, y, n = patches.shape
    _, sx, sy, _ = kernel
    window_func = lambda x: tf.contrib.distributions.percentile(x, 50.0)
    patches = tf.reshape(patches, [x*y, sx, sy])
    smoothed = tf.map_fn(lambda x: window_func(patches[x,:,:]), tf.range(x*y), dtype=tf.float32)
    smoothed = tf.reshape(smoothed, [x, y])
elif len(imgs.shape) == 5:
    kernel = (1,12,12,12,1)
    stride = (1,1,1,1,1)
    padding = 'SAME'
    patches = tf.extract_volume_patches(
        imgs, kernel, stride, padding,
    )
    _, x, y, z, n = patches.shape
    _, sx, sy, sz, _ = kernel
    window_func = lambda x: tf.contrib.distributions.percentile(x, 50.0)
    patches = tf.reshape(patches, [x*y*z, sx, sy, sz])
    smoothed = tf.map_fn(lambda x: window_func(patches[x,:,:]), tf.range(x*y*z), dtype=tf.float32)
    smoothed = tf.reshape(smoothed, [x, y, z])
else:
    raise NotImplementedError()
with tf.Session() as sess:
    output = sess.run(smoothed)
etime = time.time()
print('tf smoothed: {:1.4f} seconds'.format(etime-stime))
print(output.shape)
plt.figure(figsize=(20,20))
plt.subplot(131)
imgs = img_as_float(data.camera())
plt.imshow(imgs.squeeze(),cmap='gray',interpolation='none')
plt.title('original')
plt.subplot(132)
plt.imshow(output.squeeze(),cmap='gray',interpolation='none')
plt.title('actual smoothed\nwith {}'.format(method))
plt.subplot(133)
plt.imshow(scipysmoothed,cmap='gray',interpolation='none')
_=plt.title('expected smoothed')
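As a possible vectorized variant of the 2D branch above (a sketch only, not benchmarked; it assumes tf.contrib.distributions.percentile accepts an axis argument, mirroring np.percentile), the per-patch tf.map_fn loop could be replaced by a single median reduction over the last axis of the patch tensor:
import numpy as np
import tensorflow as tf
from skimage import img_as_float, data

# [1, H, W, 1] float32 image, as expected by extract_image_patches.
img = img_as_float(data.camera())[None, :, :, None].astype(np.float32)
kernel, stride, rates = (1, 9, 9, 1), (1, 1, 1, 1), (1, 1, 1, 1)
# Each output position holds the 9*9=81 pixels of its window along the last axis.
patches = tf.image.extract_image_patches(img, kernel, stride, rates, 'SAME')
# Median over the window values, computed for all positions at once.
median = tf.contrib.distributions.percentile(patches, 50.0, axis=-1)  # [1, H, W]

with tf.Session() as sess:
    out = sess.run(median)[0]  # [H, W] median-filtered image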
I was testing torchvision datasets and found that, after the first run, if I comment out the dataset-loading part of my code it runs faster (0.6654326915740967 seconds compared to 7.922324180603027 seconds). I think this happens because the dataset needs to be reloaded, but where the loaded dataset is stored, and why the code still runs without it, I don't know.
Why/how does this work? How can I achieve the same thing without manually commenting and uncommenting the code?
code:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
import time
import matplotlib.pyplot as plt
def ShowImage(imag):
    temp = imag.view(3, 96, 96)
    temp = temp.cpu()
    temp = transforms.functional.to_pil_image(temp)
    plt.imshow(temp)
    plt.show()
path = '/home/Machine Learning/Datasets'
start = time.time()
ToTensor = transforms.Compose([transforms.ToTensor()])
train_data = torchvision.datasets.STL10(path, split='train', transform=ToTensor)
test_data = torchvision.datasets.STL10(path, split='test', transform=ToTensor)
dataloader_train = DataLoader(train_data, batch_size=1, shuffle=True)
for i, batch in enumerate(dataloader_train):
    imag, label = batch
    ShowImage(imag)
    if i > 10:
        break
end = time.time()
print(end - start)
#output 7.922324180603027
commented code:
import torch
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader
import time
import matplotlib.pyplot as plt
def ShowImage(imag):
    temp = imag.view(3, 96, 96)
    temp = temp.cpu()
    temp = transforms.functional.to_pil_image(temp)
    plt.imshow(temp)
    plt.show()
path = '/home/Machine Learning/Datasets'
start = time.time()
# ToTensor = transforms.Compose([transforms.ToTensor()])
# train_data = torchvision.datasets.STL10(path, split='train', transform=ToTensor)
# test_data = torchvision.datasets.STL10(path, split='test', transform=ToTensor)
# dataloader_train = DataLoader(train_data, batch_size=1, shuffle=True)
for i, batch in enumerate(dataloader_train):
    imag, label = batch
    ShowImage(imag)
    if i > 10:
        break
end = time.time()
print(end - start)
#output 0.6654326915740967
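One way to get the same effect without commenting and uncommenting by hand (a sketch, assuming the code runs in an interactive session such as Jupyter, where globals survive between runs; that the commented version runs at all suggests dataloader_train is still alive from a previous run): build the loader only if it does not already exist.
import torchvision
from torchvision import transforms
from torch.utils.data import DataLoader

path = '/home/Machine Learning/Datasets'

try:
    dataloader_train  # already defined by an earlier run of this cell/script
except NameError:
    # First run: pay the dataset construction cost once.
    ToTensor = transforms.Compose([transforms.ToTensor()])
    train_data = torchvision.datasets.STL10(path, split='train', transform=ToTensor)
    dataloader_train = DataLoader(train_data, batch_size=1, shuffle=True)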
Why does Anaconda Accelerate compute dot products more slowly than plain NumPy on Python 3? I'm using accelerate version 2.3.1 with accelerate_cudalib 2.0 installed, on Python 3.5.2, Windows 10 64-bit.
import numpy as np
from accelerate.cuda.blas import dot as gpu_dot
import time
def numpydot():
    start = time.time()
    for i in range(100):
        np.dot(np.arange(1000000, dtype=np.float64), np.arange(1000000, dtype=np.float64))
    elapsedtime = time.time() - start
    return elapsedtime

def acceleratedot():
    start = time.time()
    for i in range(100):
        gpu_dot(np.arange(1000000, dtype=np.float64), np.arange(1000000, dtype=np.float64))
    elapsedtime = time.time() - start
    return elapsedtime
>>> numpydot()
0.6446375846862793
>>> acceleratedot()
1.33168363571167
I figured out that shared arrays are created with Numba, a separate library; the documentation is on their site.
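For what it's worth, a slightly tighter version of the benchmark (a hypothetical refactor, not a fix) builds the operands once, so the loop times only the dot product itself rather than also timing the two np.arange constructions on every iteration:
import time
import numpy as np
from accelerate.cuda.blas import dot as gpu_dot

# Build the operands once so the loop measures only the dot product.
a = np.arange(1000000, dtype=np.float64)
b = np.arange(1000000, dtype=np.float64)

def time_it(dot_fn, repeats=100):
    start = time.time()
    for _ in range(repeats):
        dot_fn(a, b)
    return time.time() - start

print("numpy     :", time_it(np.dot))
print("accelerate:", time_it(gpu_dot))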