Graphviz from sklearn cannot find my file? - python

So I'm trying to build a decision tree in Python using sklearn:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import graphviz

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target,
                                                    stratify=cancer.target, random_state=42)
tree = DecisionTreeClassifier(random_state=0, max_depth=4)
tree.fit(X_train, y_train)
export_graphviz(tree, out_file=r"C:\Users\obaro\OneDrive\Documents\tree.dot",
                class_names=["malignant", "benign"],
                feature_names=cancer.feature_names, impurity=False, filled=True)

with open(r"C:\Users\obaro\OneDrive\Documents\tree.dot") as f:
    dot_graph = f.read()
display(graphviz.Source(dot_graph))
When I try to run this code in Jupyter, though, I get a FileNotFound error and an ExecutableNotFound error. At first I tried a relative path; that didn't work, so I tried an absolute path. The file WAS created and is in my current home directory, so I'm not sure what's going on here. Any help would be appreciated, thanks.
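The ExecutableNotFound part is usually the real problem here: the graphviz Python package only produces the DOT source, while rendering requires the separate Graphviz system binaries (dot.exe) to be on your PATH. A minimal sketch of a workaround, assuming Graphviz was installed to its default Windows location (adjust the path if yours differs):
import os
import graphviz

# Assumption: default Windows install location for the Graphviz binaries.
os.environ["PATH"] += os.pathsep + r"C:\Program Files\Graphviz\bin"

with open(r"C:\Users\obaro\OneDrive\Documents\tree.dot") as f:
    dot_graph = f.read()

# Once dot is found on PATH, Jupyter renders the Source object inline.
display(graphviz.Source(dot_graph))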

Related

NameError: name 'X_train' is not defined... tried in Spyder too

I have been trying to run this statement in Colab, but it always shows the NameError:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
Please help me solve this problem.
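The NameError usually just means x and y were never defined in that Colab session, for example because the cell that loads the dataset was not run before this one. A minimal sketch with placeholder data (the arrays here are assumptions; in practice x and y come from your dataset):
import numpy as np
from sklearn.model_selection import train_test_split

# Placeholder feature matrix and target vector so the split has something to split.
x = np.arange(20).reshape(10, 2)
y = np.arange(10)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)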

Import Error while using new xgboost-distribution

I'm using the new xgboost-distribution module. I put the following into my file:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from xgboost_distribution import XGBDistribution

if __name__ == '__main__':
    data = load_boston()
    X, y = data.data, data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    model = XGBDistribution(
        distribution="normal",
        n_estimators=500
    )
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        early_stopping_rounds=10
    )
When I run this code, however, it generates an import error within the xgboost/sklearn.py module file:
from xgboost_distribution import XGBDistribution
  File "/Users/timgundersen/opt/anaconda3/lib/python3.8/site-packages/xgboost_distribution/__init__.py", line 18, in <module>
    from xgboost_distribution.model import XGBDistribution  # noqa
  File "/Users/timgundersen/opt/anaconda3/lib/python3.8/site-packages/xgboost_distribution/model.py", line 7, in <module>
    from xgboost.sklearn import XGBModel, _wrap_evaluation_matrices, xgboost_model_doc
ImportError: cannot import name '_wrap_evaluation_matrices' from 'xgboost.sklearn' (/Users/timgundersen/opt/anaconda3/lib/python3.8/site-packages/xgboost/sklearn.py)
I tried updating sklearn, as well as xgboost and the other modules that sklearn requires.
Why would this be giving me an import error if my computer is updated with all of the correct versions?
Which Python version do you use? With Python 3.8, it should work with scikit-learn 0.24.1:
pip install scikit-learn==0.24.1
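It may also be worth checking the xgboost version: _wrap_evaluation_matrices is a private helper inside xgboost, so xgboost-distribution can only import it from the xgboost releases it was written against, and a mismatch produces exactly this ImportError. A quick sketch for inspecting the installed versions (assuming the PyPI distribution name is xgboost-distribution; importlib.metadata is in the standard library on Python 3.8+):
import xgboost
from importlib.metadata import version

# Private names like _wrap_evaluation_matrices can move between xgboost
# releases, so the exact installed versions matter.
print("xgboost:", xgboost.__version__)
print("xgboost-distribution:", version("xgboost-distribution"))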

Why is the error "cannot import name 'balanced_accuracy' from 'sklearn.metrics'" more complicated than it should be?

I'm trying to do a multiclass classification project on an imbalanced COVID-19 dataset, using Jupyter notebooks on Kaggle. My code is:
import os
import random
import gc
import cv2
import numpy as np
from numpy import asarray
import itertools

train_dir = '/kaggle/input/pandemic2/Training/Training'
test_dir = '/kaggle/input/pandemic2/Testing/Testing'

train_covid = ['/kaggle/input/pandemic2/Training/Training/{}'.format(i)
               for i in os.listdir(train_dir) if 'COVID' in i]
train_normal = ['/kaggle/input/pandemic2/Training/Training/{}'.format(i)
                for i in os.listdir(train_dir) if 'Normal' in i]
# any() tests every keyword against the filename; a bare chain of `or`s
# between string literals is always truthy and would match every file.
train_pneumonia = ['/kaggle/input/pandemic2/Training/Training/{}'.format(i)
                   for i in os.listdir(train_dir)
                   if any(k in i for k in ('MERS', 'SARS', 'Bacterial', 'Chlamydia',
                                           'Influenza', 'Klebsiella', 'Legionella',
                                           'Mycoplasma', 'Pneumocystis',
                                           'Streptococcus', 'Varicella'))]

test_imgs = ['/kaggle/input/pandemic2/Testing/Testing/{}'.format(i) for i in os.listdir(test_dir)]
train_imgs = train_covid[:] + train_normal[:] + train_pneumonia[:]
random.shuffle(train_imgs)

del train_covid
del train_normal
del train_pneumonia
gc.collect()

nrows = 150
ncolumns = 150
channels = 3

def read_and_process_image(list_of_images):
    x = []
    y = []
    for image in list_of_images:
        x.append(cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR), (nrows, ncolumns),
                            interpolation=cv2.INTER_CUBIC))
        if 'Normal' in image:
            y.append(0)
        elif 'COVID' in image:
            y.append(1)
        else:
            y.append(2)
    return x, y

x, y = read_and_process_image(train_imgs)
del train_imgs
gc.collect()

x = np.array(x)
print(x.shape)
y = np.array(y)
print(y.shape)
import sklearn
from keras import layers
from keras import models
from keras import optimizers
from sklearn.model_selection import StratifiedKFold
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array, load_img
from sklearn.model_selection import cross_val_score
from sklearn.metrics import balanced_accuracy
from keras.applications import InceptionResNetV2

conv_base = InceptionResNetV2(weights='imagenet', include_top=False, input_shape=(150, 150, 3))

model = models.Sequential()
model.add(conv_base)
model.add(layers.Flatten())
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=optimizers.Adam(lr=1e-4),
              metrics=['categorical_accuracy'])
train_datagen = ImageDataGenerator(rescale=1./255,
                                   rotation_range=40,
                                   width_shift_range=0.2,
                                   height_shift_range=0.2,
                                   shear_range=0.2,
                                   zoom_range=0.2,
                                   horizontal_flip=True)
val_datagen = ImageDataGenerator(rescale=1./255)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
scores = cross_val_score(model, x, y, cv=5, scoring="balanced_accuracy")

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.20, random_state=2)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, stratify=y_train, train_size=0.8, random_state=2)

from keras.utils import to_categorical
y_train = to_categorical(y_train, 3)
print('Shape of training labels is:', y_train.shape)
y_val = to_categorical(y_val, 3)
print('Shape of validation labels is:', y_val.shape)
y_test = to_categorical(y_test, 3)
print('Shape of test labels is:', y_test.shape)
for index, (train_indices, val_indices) in enumerate(skf.split(x, y)):
    print("Training on fold " + str(index+1) + "/" + str(skf.n_splits))
    # Generate batches from indices
    xtrain, xval = x[train_indices], x[val_indices]
    ytrain, yval = y[train_indices], y[val_indices]

    ntrain = len(x_train)
    nval = len(x_val)
    batch_size = 32

    train_generator = train_datagen.flow(x_train, y_train, batch_size=batch_size)
    val_generator = val_datagen.flow(x_val, y_val, batch_size=batch_size)

    print("Training new iteration on " + str(xtrain.shape[0]) + " training samples, "
          + str(xval.shape[0]) + " validation samples, this may take a while...")

    history = model.fit(train_generator,
                        steps_per_epoch=ntrain//batch_size,
                        epochs=80,
                        validation_data=val_generator,
                        validation_steps=nval//batch_size,
                        verbose=2)

print('\nBalanced Accuracy:', metrics.balanced_accuracy*100, '%')
When I run the code, it gives me the following error:
Traceback (most recent call last)
<ipython-input-7-45c4c9070141> in <module>
      6 from keras.preprocessing.image import img_to_array, load_img
      7 from sklearn.model_selection import cross_val_score
----> 8 from sklearn.metrics import balanced_accuracy
      9
     10 #Download the model

ImportError: cannot import name 'balanced_accuracy' from 'sklearn.metrics' (/opt/conda/lib/python3.7/site-packages/sklearn/metrics/__init__.py)
I've tried so many solutions, like 1, 2, 3 and 4, but the last one led me to another complicated issue, because whenever I run the command conda activate myenv it gives me the error:
CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
To initialize your shell, run

    $ conda init <SHELL_NAME>

Currently supported shells are:
  - bash
  - fish
  - tcsh
  - xonsh
  - zsh
  - powershell

See 'conda init --help' for more information and options.

IMPORTANT: You may need to close and restart your shell after running 'conda init'.
Tackling this issue by trying to understand it thoroughly and following what is provided in this thread led me to the following error message:
/bin/bash: -c: line 0: syntax error near unexpected token `newline'
/bin/bash: -c: line 0: `/opt/conda/bin/conda init <bash>'
I have tried these solutions, 1 and 2, but had no luck!
Then, when I got stuck and felt trapped, I tried to follow the official conda documentation to create a virtual environment with all the needed packages, but I ran into the same error above, stating that my shell is not properly configured to activate the new virtualenv!
Again, I went back to the simple solutions and reverted my base environment to its first revision with conda list --revisions and conda install --revision=0, then updated it, but the error persists and still prevents me from using balanced_accuracy and other useful metrics.
I also tried to create a new Jupyter notebook and start from scratch by updating my packages, but they were already at their most recent versions!
I believe I'm running the configuration commands in the wrong order, because my Jupyter kernel and environment are a complete mess now.
It would be highly appreciated if someone could guide me to best practices for setting up environments for a deep learning task.
BTW: the solutions that suggest manipulating the bashrc file are a bit confusing to me, and I don't fully understand how they work.
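For the import error itself: recent scikit-learn versions name the function balanced_accuracy_score, not balanced_accuracy, while the scorer string accepted by cross_val_score is "balanced_accuracy". A minimal sketch of both spellings on a small built-in dataset (the classifier here is just a stand-in, not your Keras model):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)

# As a function, it takes true and predicted labels.
print(balanced_accuracy_score(y, clf.predict(X)))

# As a scorer, cross_val_score takes the string name instead.
print(cross_val_score(clf, X, y, cv=5, scoring="balanced_accuracy").mean())
As for the shell error: the angle brackets in conda init <SHELL_NAME> are a placeholder, so the literal command is conda init bash; typing <bash> makes bash treat < as a redirection, which is what produces the "syntax error near unexpected token `newline'" above.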

ResourceWarning: Implicitly cleaning up <Temporary Directory>

I have the following code, called test_build, which has a test case that saves a scikit-learn model, along with X_train, y_train, and score data, as a tuple object to a ".pkl" file.
from build import *
import os
import pandas as pd
import sklearn
from sklearn import *
from sklearn.ensemble import RandomForestClassifier
import unittest
from sklearn.model_selection import train_test_split
import numpy as np
import tempfile

class TestMachineLearningUtils(unittest.TestCase):

    def test_save_model(self):
        X, y = np.arange(10).reshape((5, 2)), range(5)
        model = RandomForestClassifier(n_estimators=300,
                                       oob_score=True,
                                       n_jobs=-1,
                                       random_state=123)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.33, random_state=42)
        clf = model.fit(X_train, y_train)
        score = model.score(X_test, y_test)

        dir_path = os.path.dirname(os.path.realpath(__file__))
        f = tempfile.TemporaryDirectory(dir=dir_path)
        pkl_file_name = f.name + "/" + "pickle_model.pkl"
        tuple_objects = (clf, X_train, y_train, score)
        path_model = save_model(tuple_objects, pkl_file_name)
        exists_model = os.path.exists(path_model)
        self.assertTrue(exists_model)

if __name__ == "__main__":
    unittest.main()
This is the content of the save_model function found in the build module I imported in my test file.
import pickle

def save_model(tuple_objects, model_path):
    # The file mode belongs inside open(), not as a third argument to pickle.dump.
    pickle.dump(tuple_objects, open(model_path, 'wb'))
    return model_path
The problem I am running into is that I cannot test whether the file is created within the temporary directory. It is apparently created, but judging from the warning message I receive, it is cleaned up right after:
C:\Users\User\AppData\Local\Continuum\miniconda3\envs\geoenv\lib\tempfile.py:798: ResourceWarning: Implicitly cleaning up <TemporaryDirectory>
Does anyone know a solution to this problem? How could one suppress the cleanup of a temporary directory created using the tempfile module in Python?
It looks to me like your code does exactly what you want it to do, and you are just confused by the warning. The warning is merely telling you that you should explicitly delete the temporary directory, but that the module is so kind as to do it for you.
To delete the temporary directory yourself, either use it in a context manager or call the cleanup method on it.
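A minimal sketch of the context-manager version of the test body (save_model and tuple_objects as defined in the question; assertions must run while the directory still exists):
import os
import tempfile

dir_path = os.path.dirname(os.path.realpath(__file__))

with tempfile.TemporaryDirectory(dir=dir_path) as tmp_dir:
    pkl_file_name = os.path.join(tmp_dir, "pickle_model.pkl")
    path_model = save_model(tuple_objects, pkl_file_name)
    # Check the file while the directory is still alive.
    assert os.path.exists(path_model)
# On leaving the with block the directory is removed, with no ResourceWarning.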
You simply don't. If you want the directory to outlast its scope, create something that is not a temporary directory.
More likely, when testing, you create the directory in the test setup, fill it, run the test, and tear it down, so that each test is independent; see the sketch below.
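A sketch of that setup/teardown pattern with unittest (the stub write stands in for the save_model call from the question):
import os
import tempfile
import unittest

class TestSaveModel(unittest.TestCase):

    def setUp(self):
        # A fresh directory before every test keeps tests independent.
        self.tmp_dir = tempfile.TemporaryDirectory()

    def tearDown(self):
        # Explicit cleanup after every test, so no ResourceWarning is emitted.
        self.tmp_dir.cleanup()

    def test_file_is_created(self):
        path = os.path.join(self.tmp_dir.name, "pickle_model.pkl")
        with open(path, "wb") as fh:
            fh.write(b"stub")  # stands in for save_model
        self.assertTrue(os.path.exists(path))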

Graphviz decision tree

I want to visualize a decision tree classifier applied to my dataset and see how it branches out. After several Google searches, this link came up. I am fine with this code up to the line f = tree.export_graphviz(clf, out_file=f):
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)
My question is: after this code, how can I visualize the tree? According to http://scikit-learn.org/stable/modules/tree.html, I have to use the command dot -Tpdf iris.dot -o iris.pdf to create a PDF file. I don't understand where I should run this. In Graphviz's dot tool? If yes, I get the error "Error: : syntax error in line 1 near 'dot'".
I will be grateful if anybody answers my question. Thanks.
dot -Tpdf iris.dot -o iris.pdf is a command you run in a shell such as bash, not inside the interactive dot tool (pasting it there is what produces the syntax error near 'dot', since dot then tries to parse the command as DOT graph source). You also need the Graphviz tools installed; on Ubuntu, for example, you can install them with: sudo apt-get install graphviz
According to http://scikit-learn.org/stable/modules/tree.html, if we have the Python module pydotplus installed, we can generate a PDF file (or any other supported file type) directly in Python:
import pydotplus
dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("iris.pdf")
In response to Shelly's comment, I add the following code, which is the complete code run in my IPython notebook.
import pydotplus
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
clf = tree.DecisionTreeClassifier()
clf = clf.fit(iris.data, iris.target)

with open("iris.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)

dot_data = tree.export_graphviz(clf, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("iris.pdf")
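As an aside, if you would rather skip the Graphviz toolchain entirely, scikit-learn 0.21+ ships a matplotlib-based plotter, so no .dot file or external dot binary is needed. A minimal sketch:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn import tree

iris = load_iris()
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)

# Renders the fitted tree directly with matplotlib.
plt.figure(figsize=(12, 8))
tree.plot_tree(clf, feature_names=iris.feature_names,
               class_names=list(iris.target_names), filled=True)
plt.savefig("iris_tree.pdf")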
