from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

param_test1 = {'n_estimators': range(20, 81, 10)}
gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier(learning_rate=0.1,
                                                             min_samples_split=500,
                                                             min_samples_leaf=50,
                                                             max_depth=8,
                                                             max_features='sqrt',
                                                             subsample=0.8,
                                                             random_state=10),
                        param_grid=param_test1,
                        scoring='roc_auc',
                        n_jobs=4,
                        iid=False,
                        cv=5)
gsearch1.fit(train[predictors],train[target]) #getting error on this line
Error:PermissionError: [WinError 32] The process cannot access the
file because it is being used by another process:
'C:\Users\xx\AppData\Local\Temp\joblib_memmapping_folder_xx\xx-xx-xx.pkl'
Windows 10
Python 3.6.4
sklearn.__version__
Out[26]: '0.20.2'
Please let me know how to fix this error.
It appears to be a common problem.
Googling the error (which is always good practice :-) ) led here, where they offer this workaround:
Thanks for the details. For now I'm using Parallel with max_nbytes=None
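GridSearchCV itself doesn't expose max_nbytes, so two common ways to apply that idea are to redirect joblib's memmapping folder or to switch to a backend that doesn't spawn worker processes at all. A rough sketch of both, reusing the gsearch1 object from above; neither is an official fix, just mitigations that avoid the contended temp file:
import os

# Option 1: point joblib's memmapping folder somewhere else (hypothetical path);
# this must be set before the parallel workers are started.
os.environ['JOBLIB_TEMP_FOLDER'] = r'C:\temp\joblib'

# Option 2: run the search through the threading backend, which skips
# process-based memmapping entirely. In scikit-learn 0.20.x the vendored joblib
# lives under sklearn.externals; newer versions use `from joblib import parallel_backend`.
from sklearn.externals.joblib import parallel_backend

with parallel_backend('threading'):
    gsearch1.fit(train[predictors], train[target])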
Related
I'm trying to do a multiclass classification project on an imbalanced COVID-19 dataset, using Jupyter notebooks on Kaggle. My code is:
import os
import random
import gc
import cv2
import numpy as np
from numpy import asarray
import itertools
train_dir='/kaggle/input/pandemic2/Training/Training'
test_dir='/kaggle/input/pandemic2/Testing/Testing'
train_covid = ['/kaggle/input/pandemic2/Training/Training/{}'.format(i) for i in os.listdir(train_dir) if 'COVID' in i]
train_normal = ['/kaggle/input/pandemic2/Training/Training/{}'.format(i) for i in os.listdir(train_dir) if 'Normal' in i]
pneumonia_tags = ['MERS', 'SARS', 'Bacterial', 'Chlamydia', 'Influenza', 'Klebsiella',
                  'Legionella', 'Mycoplasma', 'Pneumocystis', 'Streptococcus', 'Varicella']
train_pneumonia = ['/kaggle/input/pandemic2/Training/Training/{}'.format(i) for i in os.listdir(train_dir)
                   if any(tag in i for tag in pneumonia_tags)]
test_imgs=['/kaggle/input/pandemic2/Testing/Testing/{}'.format(i) for i in os.listdir(test_dir)]
train_imgs=train_covid[:] + train_normal[:] + train_pneumonia[:]
random.shuffle(train_imgs)
del train_covid
del train_normal
del train_pneumonia
gc.collect()
nrows=150
ncolumns=150
channels= 3
def read_and_process_image(list_of_images):
    x = []
    y = []
    for image in list_of_images:
        x.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR), (nrows, ncolumns), interpolation=cv2.INTER_CUBIC))
        if 'Normal' in image:
            y.append(0)
        elif 'COVID' in image:
            y.append(1)
        else:
            y.append(2)
    return x, y
x,y= read_and_process_image(train_imgs)
del train_imgs
gc.collect()
x=np.array(x)
print(x.shape)
y=np.array(y)
print(y.shape)
import sklearn
from keras import layers
from keras import models
from keras import optimizers
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy
from keras.applications import InceptionResNetV2
conv_base= InceptionResNetV2(weights='imagenet', include_top=False, input_shape=(150,150,3))
model=models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
from keras import optimizers
model.compile(loss='categorical_crossentropy',optimizer=optimizers.Adam(lr=1e-4), metrics= ['categorical_accuracy'])
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=40,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True)
val_datagen=ImageDataGenerator(rescale=1./255)
skf = StratifiedKFold(n_splits=5, shuffle=True,random_state=1)
scores = cross_val_score(model, x, y, cv=5, scoring= "balanced_accuracy")
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test=train_test_split(x,y, stratify=y, test_size=0.20, random_state=2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, train_size=0.8, random_state=2)
from keras.utils import to_categorical
y_train=to_categorical(y_train,3)
print('Shape of training labels is:', y_train.shape)
y_val=to_categorical(y_val,3)
print('Shape of validation labels is:', y_val.shape)
y_test=to_categorical(y_test,3)
print('Shape of test labels is:', y_test.shape)
for index, (train_indices, val_indices) in enumerate(skf.split(x, y)):
    print("Training on fold " + str(index+1) + "/" + str(skf.n_splits))
    # Generate batches from indices
    xtrain, xval = x[train_indices], x[val_indices]
    ytrain, yval = y[train_indices], y[val_indices]
    ntrain = len(x_train)
    nval = len(x_val)
    batch_size = 32
    train_generator = train_datagen.flow(x_train, y_train, batch_size=batch_size)
    val_generator = val_datagen.flow(x_val, y_val, batch_size=batch_size)
    print("Training new iteration on " + str(xtrain.shape[0]) + " training samples, " + str(xval.shape[0]) + " validation samples, this may take a while...")
    history = model.fit(train_generator,
                        steps_per_epoch=ntrain//batch_size,
                        epochs=80,
                        validation_data=val_generator,
                        validation_steps=nval//batch_size,
                        verbose=2)
print('\nBalanced Accuracy:', metrics.balanced_accuracy*100, '%')
When I run the code, it gives me the following error:
Traceback (most recent call last)
<ipython-input-7-45c4c9070141> in <module>
6 from keras.preprocessing.image import img_to_array, load_img
7 from sklearn.model_selection import cross_val_score
----> 8 from sklearn.metrics import balanced_accuracy
9
10 #Download the model
ImportError: cannot import name 'balanced_accuracy' from 'sklearn.metrics' (/opt/conda/lib/python3.7/site-packages/sklearn/metrics/__init__.py)
I've tried so many solutions like 1, 2, 3 and 4, but this solution has led me to another complicated issue, because whenever I run the command conda activate myenv it gives me the error:
CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
To initialize your shell, run
$ conda init <SHELL_NAME>
Currently supported shells are:
- bash
- fish
- tcsh
- xonsh
- zsh
- powershell
See 'conda init --help' for more information and options.
IMPORTANT: You may need to close and restart your shell after running 'conda init'.
Tackling this issue by understanding it thoroughly and trying what is provided in this thread led me to the following error message:
/bin/bash: -c: line 0: syntax error near unexpected token `newline' /bin/bash: -c: line 0: `/opt/conda/bin/conda init <bash>'.
I have tried these solutions, 1 and 2, but had no luck!
Then, when I got stuck and felt trapped, I tried to follow the official conda documentation to create a virtual environment with all the needed packages, but I ran into the same error as above, stating that my shell is not properly configured to activate the new virtualenv!
Again, I went back to the simple solutions and reverted to the first revision of my base environment with conda list --revisions and conda install --revision=0, then updated it, but the error persists and still prevents me from using balanced_accuracy and other useful metrics.
I also tried to create a new Jupyter notebook and start from scratch by updating my packages, but they were already at their most recent versions!
I believe that I'm running the set of configuration commands in the wrong order, because my Jupyter kernel and environment are a complete mess now.
It would be highly appreciated if someone could guide me to the best practices for setting up environments for a deep learning task.
BTW: the solutions suggesting manipulating the .bashrc file are a bit confusing to me, and I don't fully understand how that works.
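One aside on the ImportError itself: in scikit-learn 0.20 and later the metric is exposed as balanced_accuracy_score, not balanced_accuracy, so assuming that is the metric you want, the import and the final print would look roughly like this, reusing the model and the test split defined above:
from sklearn.metrics import balanced_accuracy_score

# y_test was one-hot encoded with to_categorical, so convert both it and the
# predictions back to integer class labels before scoring.
y_pred = model.predict(x_test).argmax(axis=1)
print('\nBalanced Accuracy:', balanced_accuracy_score(y_test.argmax(axis=1), y_pred) * 100, '%')
The corresponding cross_val_score scoring string is "balanced_accuracy", which already matches the code above.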
I'm facing a BrokenPipeError when I try to run sentiment analysis with Hugging Face. It's returning [Errno 32] Broken pipe. Is there any way to rewrite the next(iter(train_data_loader)) code?
Link with the full code: https://colab.research.google.com/drive/1wBXKa-gkbSPPk-o7XdwixcGk7gSHRMas?usp=sharing
The code is
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = GPReviewDataset(
        reviews=df.content.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=4
    )
Followed by below code
BATCH_SIZE = 16
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)
Followed by
data = next(iter(train_data_loader))
data.keys()
I'm facing the error on this data = next(iter(train_data_loader)) line.
Error is BrokenPipeError: [Errno 32] Broken pipe
One reason for this issue can be the OS. On Windows, multi-process data loading has extra requirements: workers are started with spawn, so the code that creates and iterates the DataLoader has to be picklable/importable and guarded by if __name__ == '__main__':, and in notebooks this often surfaces as exactly this BrokenPipeError. The simplest fix is not to set num_workers at all; the default num_workers=0 loads data in the main process and works on Windows.
DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=0  # should be zero on Windows
)
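If multiple workers are really needed on Windows, the usual pattern (a sketch only, reusing the names defined in the question above) is to keep the DataLoader creation and iteration inside a main guard in a script, because worker processes re-import the main module under the spawn start method:
# Minimal sketch: only the main process should build and consume the loader.
if __name__ == '__main__':
    train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
    data = next(iter(train_data_loader))  # with num_workers > 0 batches now come from worker processes
    print(data.keys())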
I'm trying to evaluate an ANN. I get the accuracies if I use n_jobs = 1; however, when I use n_jobs = -1 I get the following error.
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
I have tried other numbers, but it only works if I use n_jobs = 1.
This is the code I am running:
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs = -1)
This is the error I am getting:
Traceback (most recent call last):
  File "<ipython-input-12-cc51c2d2980a>", line 1, in <module>
    accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 10, n_jobs = -1)
  File "C:\Users\javie\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 402, in cross_val_score
    error_score=error_score)
  File "C:\Users\javie\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 240, in cross_validate
    for train, test in cv.split(X, y, groups))
  File "C:\Users\javie\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 930, in __call__
    self.retrieve()
  File "C:\Users\javie\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 833, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "C:\Users\javie\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 521, in wrap_future_result
    return future.result(timeout=timeout)
  File "C:\Users\javie\Anaconda3\lib\concurrent\futures\_base.py", line 432, in result
    return self.__get_result()
  File "C:\Users\javie\Anaconda3\lib\concurrent\futures\_base.py", line 384, in __get_result
    raise self._exception
BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.
Spyder should have analyzed each batch in parallel, but even when I use n_jobs = 1 it only analyzes 10 epochs.
This always happens when using multiprocessing in an IPython console in Spyder. A workaround is to run the script from the command line instead.
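When moving the script to the command line on Windows, it also helps to keep the parallel call itself under a main guard, since process-based workers re-import the script. A generic sketch, reusing the names from the question:
from sklearn.model_selection import cross_val_score

# Only the main process should launch the parallel cross-validation.
if __name__ == '__main__':
    accuracies = cross_val_score(estimator=classifier, X=X_train, y=y_train, cv=10, n_jobs=-1)
    print(accuracies.mean(), accuracies.std())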
Just posting this for others, in case it's helpful. I ran into the same issue today running a GridSearchCV on a Dask array / cluster. scikit-learn v0.24.
Solved it by using the joblib context manager as described here: https://joblib.readthedocs.io/en/latest/parallel.html#thread-based-parallelism-vs-process-based-parallelism
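For reference, a minimal self-contained sketch of that context-manager approach (a toy estimator and dataset, not the Dask setup from the comment above):
import joblib
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

X, y = load_iris(return_X_y=True)
search = GridSearchCV(SVC(), {'C': [0.1, 1, 10]}, cv=5)

# Route the search's internal parallelism through an explicitly chosen backend:
# 'threading' avoids inter-process serialization entirely, 'loky' is the process-based default.
with joblib.parallel_backend('threading', n_jobs=4):
    search.fit(X, y)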
I get this error as well, but only on Windows. I am using joblib to run a function (call it func_x) in parallel. That function is imported from a module, let's call it module_a.
module_a also uses a function (call it func_y) from another module, module_b, which it imports using the syntax import module_b.
I found that I can avoid the BrokenProcessPool error if I edit module_a and change the import line to from module_b import func_y.
I also had to remove the if __name__ == '__main__': guard from the main script which was importing module_a.
I think this subtle difference in how modules are imported to the namespace determines whether that module can then be pickled by joblib for parallel processing in Windows.
I hope this helps!
--
A minimal reproducible example is below:
Original main.py
from joblib import Parallel, delayed
import module_a
if __name__ == '__main__':
    Parallel(n_jobs=4, verbose=3)(delayed(module_a.func_x)(i) for i in range(50))
Original module_a.py (fails on Windows with BrokenProcessPool error; kernel restart required)
import module_b
def func_x(i):
    j = i ** 3
    k = module_b.func_y(j)
    return k
Edited main.py
from joblib import Parallel, delayed
import module_a
Parallel(n_jobs=4, verbose=3)(delayed(module_a.func_x)(i) for i in range(50))
Edited module_a.py (succeeds on Windows)
from module_b import func_y # changed
def func_x(i):
    j = i ** 3
    k = func_y(j)  # changed
    return k
module_b.py
def func_y(m):
    k = m ** 3
    return k
If you use the Spyder IDE, simply switch to an external terminal in the settings (Run > Execute in an external system terminal).
I got a similar error when I was using RandomizedSearchCV with all available cores and threads. I realized that I only got this error when the number of combinations sampled from the parameter space was small, say 5. When I selected a higher n_iter value, the issue was fixed. I believe it has something to do with the fact that the number of fits should be at least as large as the number of cores and parallel compute units on your system.
Hope this helps someone later.
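A rough sketch of that experiment (the estimator and parameter space are hypothetical; the only point is keeping n_iter comfortably above the number of cores):
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

X, y = load_iris(return_X_y=True)
param_dist = {'n_estimators': [50, 100, 200, 400], 'max_depth': [None, 5, 10, 20]}

# With n_iter as low as 5 the error appeared for the poster; a larger n_iter did not trigger it.
search = RandomizedSearchCV(RandomForestClassifier(random_state=0), param_dist,
                            n_iter=16, cv=5, n_jobs=-1, random_state=0)
search.fit(X, y)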
I'm trying to learn ML, and I am a noob.
Currently, I'm following this video (https://www.youtube.com/watch?v=tNa99PG8hR8).
And my code is:
import pydotplus
import numpy as np
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.externals.six import StringIO
iris = load_iris()
##print(iris.feature_names)
##print(iris.target_names)
##print(iris.data[0])
##print(iris.target[0])
##for i in range(len(iris.target)):
## print("Example %d: label %s, features %s" % (i, iris.target[i], iris.data[i]))
test_idx = [0,50,100]
##training data
train_target = np.delete(iris.target, test_idx)
train_data = np.delete(iris.data, test_idx, axis=0)
##testing data
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]
clf = tree.DecisionTreeClassifier()
clf.fit(train_data, train_target)
print(test_target)
print(clf.predict(test_data))
dot_data = StringIO()
tree.export_graphviz(clf,
                     out_file=dot_data,
                     feature_names=iris.feature_names,
                     class_names=iris.target_names,
                     filled=True, rounded=True,
                     impurity=False)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_pdf("iris.pdf")
But the error shows:
Traceback (most recent call last):
File "C:\Users\Denis\Desktop\Machine Learning\iris.py", line 42, in graph.write_pdf("iris.pdf")
File "C:\Users\Denis\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pydotplus\graphviz.py", line 1810, in prog=self.prog: self.write(path, format=f, prog=prog)
File "C:\Users\Denis\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pydotplus\graphviz.py", line 1918, in write fobj.write(self.create(prog, format))
File "C:\Users\Denis\AppData\Local\Programs\Python\Python36-32\lib\site-packages\pydotplus\graphviz.py", line 1960, in create 'GraphViz\'s executables not found') pydotplus.graphviz.InvocationException: GraphViz's executables not found
I've tried reinstalling pydotplus and graphviz, but to no avail.
I have no clue how to change the path.
I've searched my graphviz folder, and I found no bin files.
So, you installed the graphviz library for Python, but I guess you haven't installed the Graphviz software itself.
You can install it from here and make sure that the directory containing the dot executable is on your system's PATH.
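For example, a quick way to check from Python whether dot is actually visible, and to add its folder to the PATH for the current session (the install location below is only an illustration):
import os
import shutil

print(shutil.which("dot"))  # None means the Graphviz binaries are not on the PATH

# If it prints None, append the Graphviz bin directory (adjust to your install location):
os.environ["PATH"] += os.pathsep + r"C:\Program Files (x86)\Graphviz2.38\bin"
print(shutil.which("dot"))  # should now print the full path to dot.exe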
Good luck in ML journey! :-)
For all those who are facing this issue on Windows 10 even after trying the above-mentioned steps (i.e. installing the Graphviz software separately), this worked for me: launch CMD as administrator (important!), run dot -c, and then run dot -v.
This fixed the issue for me.
I am using Python and Keras on top of Tensorflow to train my neural networks.
When I switched from Ubuntu 16.04 to Windows 10, my model could no longer be saved when I ran the following:
filepath = "checkpoint-"+str(f)+model_type+"-"+optimizer_name+"-{epoch:02d}-{loss:.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]
and later on:
model.fit(X, y,
          batch_size=128,
          epochs=1,
          shuffle=False,
          callbacks=callbacks_list)
I get this error:
OSError: Unable to create file (Unable to open file: name = 'checkpoint-<_io.textiowrapper name='data/swing-projects100-raw/many-chunks/log-gamma-f3.txt' mode='a' encoding='cp1252'>2l128-adam-0.001-{epoch:02d}-{loss:.3f}.h5', errno = 22, error message = 'invalid argument', flags = 13, o_flags = 302)
I have Keras 2.0.8 and h5py 2.7.0 installed via conda.
I tried
filepath = "checkpoint-"+str(f)+model_type+"-"+optimizer_name+"-{epoch:02d}-{loss:.3f}.hdf5"
with open(filepath, "w") as f:
f.write("Test.")
and got a similar error:
OSError: [Errno 22] Invalid argument: "checkpoint-<_io.TextIOWrapper name='data/swing-projects100-raw/many-chunks/log-gamma-f3.txt' mode='a' encoding='cp1252'>2L128-Adam-0.001-{epoch:02d}-{loss:.3f}.hdf5"
When I removed str(f) from the filepath, it worked.
f was supposed to be an integer, and I don't know why it caused the error, but removing it from the string solved my problem.
Let me know if you know exactly why.
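Judging from the error message, str(f) expanded to <_io.TextIOWrapper name='data/...' mode='a' ...>, so at that point f was bound to an open (log) file object, most likely rebound earlier by a with open(...) as f: block, not to an integer. The resulting filename then contains characters such as <, > and :, which Linux accepts but Windows forbids in file names, which would also explain why the same code worked on Ubuntu 16.04. A defensive sketch, with fold_index as a hypothetical stand-in for the integer that was meant to go into the name:
import re
from keras.callbacks import ModelCheckpoint

def safe(part):
    # Strip characters that Windows does not allow in file names.
    return re.sub(r'[<>:"/\\|?*]', '_', str(part))

fold_index = 3  # hypothetical; model_type and optimizer_name as in the question
filepath = "checkpoint-" + safe(fold_index) + safe(model_type) + "-" + safe(optimizer_name) + "-{epoch:02d}-{loss:.3f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')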
I had a similar problem with this code:
agent.save("./saved_models/weights_episode_{}.h5".format(e))
I solved it by manually creating the folder saved_models
e being an integer did not cause any problems in my case.
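Creating the directory from the script itself also works; a minimal sketch:
import os

# Make sure the target folder exists before saving; exist_ok avoids an error if it already does.
os.makedirs("./saved_models", exist_ok=True)
agent.save("./saved_models/weights_episode_{}.h5".format(e))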
I had a similar problem when using TensorFlow on a remote machine.
In my case the reason was probably 'no permission to modify the file'.
I solved it by using a save path like "../model.h5", i.e. a folder where you do have write permission.
May this help someone.