Swift Client with DecisionTreeRegressor - python

I am working with a Bluemix Object Storage container. I want to store my "RandomForestRegressor" in a .pkl file with joblib, but when I run the code with the Swift client I receive the following error:
TypeError: object of type 'DecisionTreeRegressor' has no len()
Here is my code please help.
import os
from flask import Flask,render_template, request,json
from flask.ext.cors import CORS
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import os
from sklearn.externals import joblib
import pickle
import sys
import json
import csv
import swiftclient
app = Flask(__name__)
CORS(app)
cloudant_service = json.loads(os.environ['VCAP_SERVICES'])['Object-Storage'][0]
objectstorage_creds = cloudant_service['credentials']
if objectstorage_creds:
auth_url = objectstorage_creds['auth_url'] + '/v3' #authorization URL
password = objectstorage_creds['password'] #password
project_id = objectstorage_creds['projectId'] #project id
user_id = objectstorage_creds['userId'] #user id
region_name = objectstorage_creds['region'] #region name
def predict_joblib():
    """Train a random forest on ``training_set.json`` and upload it to Swift.

    Connects to Object Storage with the module-level credentials, creates
    the target container, lists existing containers and objects, fits a
    ``RandomForestRegressor`` on the training set (target column ``Price``)
    and uploads the pickled model and vectorizers as objects.

    Returns:
        The tuple ``(rfreg, vectorizers)``.
    """
    conn = swiftclient.Connection(
        key=password,
        authurl=auth_url,
        auth_version='3',
        os_options={
            "project_id": project_id,
            "user_id": user_id,
            "region_name": region_name,
        },
    )
    container_name = 'my-container'

    # Create a new container
    conn.put_container(container_name)
    print("\nContainer %s created successfully." % container_name)

    # List your containers
    print("\nContainer List:")
    for container in conn.get_account()[1]:
        print(container['name'])

    # List objects in a container, and print each object's name, size and
    # last modified date
    print("\nObject List:")
    for container in conn.get_account()[1]:
        for data in conn.get_container(container['name'])[1]:
            print('object: {0}\t size: {1}\t date: {2}'.format(
                data['name'], data['bytes'], data['last_modified']))

    print("-----------LEARN-----------\n")
    with open('training_set.json') as json_data:
        df_train = pd.read_json(json_data)
    train_X = df_train.drop('Price', axis=1)
    train_y = df_train['Price']

    print("Training...")
    rfreg = RandomForestRegressor(n_estimators=100, n_jobs=-1)
    rfreg.fit(train_X, train_y)

    print("\nPerformance on training set:")
    print('R^2: %f' % rfreg.score(train_X, train_y))

    importances = rfreg.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("\nFeature ranking:")
    for rank, idx in enumerate(indices, start=1):
        print("%d. feature %d %s (%f)" % (
            rank, idx, df_train.columns[idx], importances[idx]))

    # put_object() needs a string/bytes payload or a file-like object.
    # Handing it the estimator itself makes swiftclient call len() on it,
    # which is what raised "object of type ... has no len()". Serialize to
    # bytes first.
    print("Serializing models using pickle...")
    conn.put_object(container_name, 'v3.pkl', contents=pickle.dumps(rfreg))

    # NOTE(review): `vectorizers` is not defined anywhere in this snippet;
    # it is assumed to be a module-level dict keyed by feature name.
    print("Serializing vectorizers using pickle...")
    for feature in ['Fluorescence', 'Culet']:
        conn.put_object(container_name, feature + '_v3.pkl',
                        contents=pickle.dumps(vectorizers[feature]))
    return rfreg, vectorizers
@app.route('/')
def hello():
    """Handle ``/``: run the train-and-upload job, then return a greeting."""
    predict_joblib()
    return 'Welcome to Python Flask!'
@app.route('/signUp')
def signUp():
    """Handle ``/signUp``: return a fixed placeholder response."""
    return 'signUp'
port = os.getenv('PORT', '5000')
if __name__ == "__main__":
app.debug = True
app.run(host='0.0.0.0', port=int(port))

Related

Unable to successfully patch functions of Azure ContainerClient

I have been trying to patch the list_blobs() function of ContainerClient but have not been able to do so successfully. This code outputs a MagicMock() object, but the function isn't patched as I would expect it to be (I am trying to patch it to return the list ['Blob1', 'Blob2']).
#################Script File
import sys
from datetime import datetime, timedelta
import pyspark
import pytz
import yaml
# from azure.storage.blob import BlobServiceClient, ContainerClient
from pyspark.dbutils import DBUtils as dbutils
import azure.storage.blob
# Open Config
def main():
    """Delete blobs older than the configured retention period.

    Reads the YAML config file named by ``sys.argv[1]``, fetches the
    storage account key from the secret scope named in the config, lists
    the blobs in the configured container and deletes every blob whose
    ``creation_time`` is older than "Days History To Keep" days.
    """
    spark_context = pyspark.SparkContext.getOrCreate()
    spark_context.addFile(sys.argv[1])

    # The context manager closes the file even if YAML parsing raises.
    with open(sys.argv[1], "r") as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    account_key = dbutils.secrets.get(
        scope=config["Secrets"]["Scope"], key=config["Secrets"]["Key Name"]
    )
    target_container = config["Storage Configuration"]["Container"]
    target_account = config["Storage Configuration"]["Account"]
    days_history_to_keep = config["Storage Configuration"]["Days History To Keep"]

    connection_string = (
        "DefaultEndpointsProtocol=https;AccountName="
        + target_account
        + ";AccountKey="
        + account_key
        + ";EndpointSuffix=core.windows.net"
    )
    blob_service_client: azure.storage.blob.BlobServiceClient = (
        azure.storage.blob.BlobServiceClient.from_connection_string(connection_string)
    )
    container_client: azure.storage.blob.ContainerClient = (
        blob_service_client.get_container_client(target_container)
    )
    blobs = container_client.list_blobs()
    print(blobs)
    print(blobs)

    # Take "now" directly in UTC. Localizing datetime.today() would stamp
    # the machine's local wall-clock time as UTC and shift the cutoff by
    # the local UTC offset.
    delete_before_date = datetime.now(pytz.UTC) - timedelta(
        days=days_history_to_keep
    )
    for blob in blobs:
        if blob.creation_time < delete_before_date:
            print("Deleting Blob: " + blob.name)
            container_client.delete_blob(blob, delete_snapshots="include")
if __name__ == "__main__":
main()
#################Test File
import unittest
from unittest import mock
import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
    """Unit tests for ``DeleteOldBlobs.main`` with external services mocked."""

    def setUp(self):
        pass

    # Decorators are applied bottom-up, so the lowest patch supplies the
    # first mock argument after ``self``.
    @mock.patch("DeleteOldBlobs.azure.storage.blob.ContainerClient")
    @mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
    @mock.patch("DeleteOldBlobs.dbutils")
    @mock.patch("DeleteOldBlobs.sys")
    @mock.patch('DeleteOldBlobs.pyspark')
    def test_main(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient, mock_containerclient):
        """Run ``main`` against the patched modules."""
        # mock setup
        config_file = "Delete_Old_Blobs_UnitTest.yml"
        mock_sys.argv = ["unused_arg", config_file]
        mock_dbutils.secrets.get.return_value = "A Secret"
        mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
        # execute test
        DeleteOldBlobs.main()
        # TODO assert actions taken
        # mock_sys.argv.__get__.assert_called_with()
        # dbutils.secrets.get(scope=config['Secrets']['Scope'], key=config['Secrets']['Key Name'])
if __name__ == "__main__":
unittest.main()
Output:
<MagicMock name='BlobServiceClient.from_connection_string().get_container_client().list_blobs()' id='1143355577232'>
What am I doing incorrectly here?
I'm not able to execute your code at the moment, but I have tried to simulate it. To do this, I created the following 3 files in the path /<path-to>/pkg/sub_pkg1 (where pkg and sub_pkg1 are packages).
File ContainerClient.py
def list_blobs(self):
    """Stand-in for the real client call; always returns ``"blob1"``."""
    stub_result = "blob1"
    return stub_result
File DeleteOldBlobs.py
from pkg.sub_pkg1 import ContainerClient
# Open Config
def main():
    """Fetch the blob listing once and print it twice."""
    listing = ContainerClient.list_blobs()
    for _ in range(2):
        print(listing)
File DeleteBlobsTest.py
import unittest
from unittest import mock
from pkg.sub_pkg1 import DeleteOldBlobs
class DeleteBlobsTest(unittest.TestCase):
    """Shows ``list_blobs`` being replaced by a mock while ``main`` runs."""

    def setUp(self):
        pass

    def test_main(self):
        """Patch ``list_blobs`` with a mock returning two blob names."""
        mock_containerclient = mock.MagicMock()
        patcher = mock.patch(
            "DeleteOldBlobs.ContainerClient.list_blobs",
            mock_containerclient.list_blobs,
        )
        with patcher:
            mock_containerclient.list_blobs.return_value = ["ablob1", "ablob2"]
            DeleteOldBlobs.main()
if __name__ == '__main__':
unittest.main()
If you execute the test code you obtain the output:
['ablob1', 'ablob2']
['ablob1', 'ablob2']
This output means that the function list_blobs() is mocked by mock_containerclient.list_blobs.
I don't know whether this post will be useful to you, but I'm not able to simulate your code any better at the moment.
I hope my code gives you some ideas for finding your real solution.
The structure of the answer didn't match my solution. Perhaps both will work, but it was important for me to patch pyspark even though I never call it; otherwise exceptions would be thrown when my code tried to interact with Spark.
Perhaps this will be useful to someone:
@mock.patch("DeleteOldBlobs.azure.storage.blob.BlobServiceClient")
@mock.patch("DeleteOldBlobs.dbutils")
@mock.patch("DeleteOldBlobs.sys")
@mock.patch('DeleteOldBlobs.pyspark')
def test_list_blobs_called_once(self, mock_pyspark, mock_sys, mock_dbutils, mock_blobserviceclient):
    """Check that ``main`` lists the container's blobs exactly once.

    The container client is reached through
    ``BlobServiceClient.from_connection_string(...).get_container_client(...)``,
    so the mock chain is wired through those return values rather than by
    patching ``ContainerClient`` directly.
    """
    # mock setup
    config_file = "Delete_Old_Blobs_UnitTest.yml"
    mock_sys.argv = ["unused_arg", config_file]
    account_key = 'Secret Key'
    mock_dbutils.secrets.get.return_value = account_key
    bsc_mock: mock.Mock = mock.Mock()
    container_client_mock = mock.Mock()
    # NOTE(review): `Blob`, `datetime` and `timedelta` are not defined in
    # this snippet; presumably a small test helper and datetime imports.
    blob1 = Blob('newblob', datetime.today())
    blob2 = Blob('oldfile', datetime.today() - timedelta(days=20))
    container_client_mock.list_blobs.return_value = [blob1, blob2]
    bsc_mock.get_container_client.return_value = container_client_mock
    mock_blobserviceclient.from_connection_string.return_value = bsc_mock
    # execute test
    DeleteOldBlobs.main()
    # Assert Results
    container_client_mock.list_blobs.assert_called_once()

Udacity Self Driving Car Simulator

I am working on Udacity's self-driving car simulator. I am facing a problem in this when I run the drive.py file with my model as argument model.h5 nothing happens in the simulator.
The model has been trained completely without any errors but still, there is no response from the simulator.
Here is the drive.py python code and a link to the video to show what is actually happening
drive.py
import argparse
import base64
from datetime import datetime
import os
import shutil
import numpy as np
import socketio
import eventlet
import eventlet.wsgi
from PIL import Image
from flask import Flask
from io import BytesIO
from keras.models import load_model
import h5py
from keras import __version__ as keras_version
# Socket.IO server that talks to the simulator, and the Flask app it wraps.
sio = socketio.Server()
app = Flask(__name__)
# Both are placeholders; `model` is assigned when the script runs as main.
model = prev_image_array = None
class SimplePIController:
    """Proportional-integral controller tracking a set point."""

    def __init__(self, Kp, Ki):
        # Gains for the proportional and integral terms.
        self.Kp, self.Ki = Kp, Ki
        self.set_point = 0.
        self.error = 0.
        self.integral = 0.

    def set_desired(self, desired):
        """Set the target value the controller drives towards."""
        self.set_point = desired

    def update(self, measurement):
        """Fold in a new measurement and return the control output."""
        deviation = self.set_point - measurement
        # Store the latest error and accumulate it into the integral term.
        self.error = deviation
        self.integral += deviation
        return self.Kp * self.error + self.Ki * self.integral
# Speed controller used for throttle, targeting `set_speed`.
set_speed = 30
controller = SimplePIController(0.1, 0.002)
controller.set_desired(set_speed)
def crop_image(img, img_height=75, img_width=200):
    """Return a horizontal band of ``img`` starting at row 60.

    The band is ``img_height`` rows tall and keeps the full width of the
    input; ``img_width`` is accepted but not used.
    """
    top = 60
    full_width = img.shape[1]
    band = slice(top, top + img_height)
    return img[band, 0:full_width]
@sio.on('telemetry')
def telemetry(sid, data):
    """Handle a telemetry event: predict steering and send controls back.

    When ``data`` is present, the camera frame is decoded and cropped, the
    steering angle comes from ``model`` and the throttle from the speed
    controller, and both are sent with ``send_control``. If an image folder
    was given on the command line, the frame is also saved there. When
    ``data`` is empty, a ``manual`` event is emitted instead.
    """
    if data:
        # The payload also carries the car's current "steering_angle" and
        # "throttle"; both are recomputed below, so only speed is read.
        speed = data["speed"]
        # The current image from the center camera of the car
        imgString = data["image"]
        image = Image.open(BytesIO(base64.b64decode(imgString)))
        image_array = crop_image(np.asarray(image))
        # Index with None to add the leading axis predict() is given here.
        steering_angle = float(model.predict(image_array[None, :, :, :], batch_size=1))
        throttle = controller.update(float(speed))
        print(steering_angle, throttle)
        send_control(steering_angle, throttle)
        # save frame
        if args.image_folder != '':
            timestamp = datetime.utcnow().strftime('%Y_%m_%d_%H_%M_%S_%f')[:-3]
            image_filename = os.path.join(args.image_folder, timestamp)
            image.save('{}.jpg'.format(image_filename))
    else:
        # NOTE: DON'T EDIT THIS.
        sio.emit('manual', data={}, skip_sid=True)
@sio.on('connect')
def connect(sid, environ):
    """Handle a new connection: log it and send a zero control command."""
    print("connect ", sid)
    send_control(0, 0)
def send_control(steering_angle, throttle):
    """Emit a ``steer`` event carrying both values as strings."""
    payload = {
        'steering_angle': str(steering_angle),
        'throttle': str(throttle),
    }
    sio.emit("steer", data=payload, skip_sid=True)
if __name__ == '__main__':
    # Parse arguments, load the Keras model, prepare the optional recording
    # folder, then serve the Socket.IO app on port 4567.
    parser = argparse.ArgumentParser(description='Remote Driving')
    parser.add_argument(
        'model',
        type=str,
        help='Path to model h5 file. Model should be on the same path.'
    )
    parser.add_argument(
        'image_folder',
        type=str,
        nargs='?',
        default='',
        help='Path to image folder. This is where the images from the run will be saved.'
    )
    args = parser.parse_args()

    # check that model Keras version is same as local Keras version
    # (the context manager closes the HDF5 handle before load_model()
    # reopens the same file)
    with h5py.File(args.model, mode='r') as f:
        model_version = f.attrs.get('keras_version')
    # Some h5py versions return the attribute as str rather than bytes;
    # normalise so the comparison below is like-for-like.
    if isinstance(model_version, str):
        model_version = model_version.encode('utf8')
    keras_version = str(keras_version).encode('utf8')

    if model_version != keras_version:
        print('You are using Keras version ', keras_version,
              ', but the model was built using ', model_version)

    model = load_model(args.model)

    if args.image_folder != '':
        print("Creating image folder at {}".format(args.image_folder))
        if not os.path.exists(args.image_folder):
            os.makedirs(args.image_folder)
        else:
            # Start from an empty folder on every run.
            shutil.rmtree(args.image_folder)
            os.makedirs(args.image_folder)
        print("RECORDING THIS RUN ...")
    else:
        print("NOT RECORDING THIS RUN ...")

    # wrap Flask application with engineio's middleware
    app = socketio.Middleware(sio, app)

    # deploy as an eventlet WSGI server
    eventlet.wsgi.server(eventlet.listen(('', 4567)), app)
problem video link
https://youtu.be/nP8WH8pM29Q
This is due to the socketio version. Use 4.2.1, that should fix your problem

Infinite loop When I import sentence_transformers in FastAPI

I am trying to serve an STS model with the FastAPI framework, but when I add the import 'from sentence_transformers import SentenceTransformer', it loops infinitely. I want to compute pred in content.py and return it from the 'predicts/' endpoint in main.py.
# main.py
from fastapi import FastAPI
from fastapi import File
import torch
from pydantic import BaseModel
from content import predict_model
app = FastAPI()
class Item(BaseModel):
    """Request body for the prediction endpoint: a pair of sentences."""

    sentence_1: str
    sentence_2: str
@app.post("/predicts")
async def predict(item: Item):
    """Handle POST ``/predicts``: run the model and return an empty body.

    NOTE(review): ``item`` is accepted but not passed to ``predict_model``,
    and the model's result is discarded.
    """
    predict_model()
    return {}
# content.py
import torch
from sentence_transformers import SentenceTransformer
def cosine_similarity_manual(x, y, small_number=1e-8):
    """Return the cosine similarity of two 1-D tensors.

    ``small_number`` is added to the product of the norms so that
    zero-length vectors do not cause a division by zero.
    """
    numerator = torch.dot(x, y)
    denominator = torch.linalg.norm(x) * torch.linalg.norm(y) + small_number
    return numerator / denominator
def predict_model():
    """Score two fixed Korean sentences and return a 0/1 similarity label.

    Loads the sentence-transformer model from ``model_path``, encodes both
    sentences, computes their cosine similarity and thresholds it at 0.6.

    Returns:
        1 if the similarity score is at least 0.6, otherwise 0.
    """
    sent1 = '무엇보다도 호스트분들이 너무 친절하셨습니다.'
    sent2 = '무엇보다도, 호스트들은 매우 친절했습니다.'
    texts = [sent1, sent2]
    model_path = "training_sts-Huffon-sentence-klue-roberta-base"
    model = SentenceTransformer(model_path)
    corpus_embeddings = model.encode(texts[0], convert_to_tensor=True)
    query_embeddings = model.encode(texts[1], convert_to_tensor=True)
    print(corpus_embeddings.shape)
    print(query_embeddings.shape)
    score = cosine_similarity_manual(corpus_embeddings, query_embeddings)
    print(score)
    pred = 1 if score >= 0.6 else 0
    print(pred)
    # Hand the label back so the caller can use it; previously it was only
    # printed and the function returned None.
    return pred

how do i reduce the loading time of a pre-trained model?

Loading the "imagenet" weights for ResNet50 takes nearly 10-11 seconds every time.
Is there any way to reduce the loading time?
Code:
from flask import Flask, render_template, request
from werkzeug import secure_filename
from flask import request,Flask
import json
import os
import time
from keras.preprocessing import image as image_util
from keras.applications.imagenet_utils import preprocess_input
from keras.applications.imagenet_utils import decode_predictions
# from keras.applications import ResNet50
from keras.applications.inception_v3 import InceptionV3
import numpy as np
app = Flask(__name__)
@app.route('/object_rec', methods=['POST'])
def object_rec():
    """Classify an uploaded image with InceptionV3 and return JSON.

    Saves the uploaded ``file`` under ``./upload/``, resizes it to 299x299,
    runs the ImageNet-trained model and returns a JSON string with keys
    ``status``, ``object`` (top label, with "Granny_Smith" reported as
    "Apple") and ``score`` (the top prediction's score as a string).
    """
    f = request.files['file']
    file_path = "./upload/" + secure_filename(f.filename)
    f.save(file_path)

    image = image_util.load_img(file_path, target_size=(299, 299))
    image = image_util.img_to_array(image)
    image = np.expand_dims(image, axis=0)  # (299,299,3) --> (1,299,299,3)
    # NOTE(review): this is imagenet_utils.preprocess_input; confirm it
    # matches the preprocessing InceptionV3 expects.
    image = preprocess_input(image)

    start_time = time.time()
    # NOTE(review): the model is rebuilt on every request, and that build
    # is what the timing below measures.
    model = InceptionV3(weights="imagenet")
    pred = model.predict(image)
    top = decode_predictions(pred)[0][0]

    acc = str(top[2])
    ans = 'Apple' if top[1] == "Granny_Smith" else top[1]

    print("THE PREDICTED IMAGE IS: " + ans)
    print("THE ACCURACY IS: " + acc)
    print("--- %s seconds ---" % (time.time() - start_time))

    result = {
        "status": True,
        "object": ans,
        "score": acc
    }
    return json.dumps(result)
if __name__ == '__main__':
app.run(host='0.0.0.0',port=6000,debug=True)
time taken would differ between 8-11 sec.
It would be good if it loaded the model and did the classification in 3-4 seconds.
Thanks in advance
The way you can do it, is to load the model in a specific session and then every time you want to use the model just set that specific session, then just call predict where you need it:
app = Flask(__name__)
sess = tf.Session(config=tf_config)
graph = tf.get_default_graph()
# IMPORTANT: models have to be loaded AFTER SETTING THE SESSION for keras!
# Otherwise, their weights will be unavailable in the threads after the
# session there has been set
set_session(sess)
model = InceptionV3(weights="imagenet")
@app.route('/object_rec', methods=['POST'])
def object_rec():
    """Sketch of a handler that reuses the model loaded at import time.

    Re-enters the graph and session the model was loaded under before
    calling ``predict``; the ``...`` argument is a placeholder.
    """
    global sess
    global graph
    with graph.as_default():
        set_session(sess)
        model.predict(...)
if __name__ == '__main__':
app.run(host='0.0.0.0',port=6000,debug=True)

Sklearn classifier and flask issues

I have been trying to self host with apache an sklearn classifier that I put together, and I ended up using joblib to serialize the saved model, then load it in a flask app. Now, this app worked perfectly when running flask's built in development server, but when I set this up with a debian 9 apache server, I get a 500 error. Delving into apache's error.log, I get:
AttributeError: module '__main__' has no attribute 'tokenize'
Now, this is funny to me because while I did write my own tokenizer, the web app gave me no problems when I was running it locally. Furthermore, the saved model that I used was trained on the webserver, so slightly different library versions should not be a problem.
My code for the web app is:
import re
import sys
from flask import Flask, request, render_template
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.externals import joblib
app = Flask(__name__)
def tokenize(text):
    """Strip punctuation from ``text`` and return its lemmatized tokens.

    Runs of non-word characters are replaced by a single space, the result
    is split with NLTK's ``word_tokenize`` and each token is lemmatized
    with WordNet.
    """
    text = re.sub(r'\W+', ' ', text)
    # One lemmatizer for the whole call instead of a new one per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
@app.route('/')
def home():
    """Handle ``/``: render the input form page."""
    return render_template('home.html')
@app.route('/analyze', methods=['POST', 'GET'])
def analyze():
    """Handle ``/analyze``: classify the submitted text and render the result.

    On POST, loads the saved pipeline, predicts a label for the form field
    ``input_text`` and renders ``result.html`` with the text, a verdict
    string, the decision-function value and the classifier's parameters.
    Any other method has nothing to classify and gets the input form.
    """
    if request.method != 'POST':
        return render_template('home.html')

    # Pickle stores functions by qualified name. The training script ran as
    # __main__, so the saved pipeline refers to its tokenizer as
    # ``__main__.tokenize``. Under a WSGI server this module is not
    # __main__, so expose the identical tokenizer under that name before
    # unpickling. NOTE(review): the cleaner fix is to move tokenize() into
    # a module imported by both scripts and retrain.
    import __main__
    if not hasattr(__main__, 'tokenize'):
        __main__.tokenize = tokenize

    input_text = request.form['input_text']
    clf = joblib.load("model.pkl.z")
    parameters = clf.named_steps['clf'].get_params()
    predicted = clf.predict([input_text])
    certainty = clf.decision_function([input_text])

    # Is it bonkers?
    verdict = "Not too nuts!" if predicted[0] else "Bonkers!"
    return render_template(
        'result.html',
        prediction=[input_text, verdict, float(certainty), parameters])
if __name__ == '__main__':
#app.debug = True
app.run()
With the .wsgi file being:
import sys
sys.path.append('/var/www/mysite')
from conspiracydetector import app as application
Furthermore, I trained the model with this code:
import logging
import pprint # Pretty stuff
import re
import sys # For command line arguments
from time import time # to show progress
import numpy as np
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.externals import joblib # In order to save
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
# Tokenizer that does stemming and strips punctuation
def tokenize(text):
    """Strip punctuation from ``text`` and return its lemmatized tokens.

    Runs of non-word characters are replaced by a single space, the result
    is split with NLTK's ``word_tokenize`` and each token is lemmatized
    with WordNet.
    """
    text = re.sub(r'\W+', ' ', text)
    # One lemmatizer for the whole call instead of a new one per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
if __name__ == "__main__":
    # Everything below stays inside the __main__ guard so that a multi-core
    # grid search also works under Windows, see:
    # http://docs.python.org/library/multiprocessing.html#windows
    # (multiprocessing backs joblib.Parallel, which GridSearchCV uses
    # whenever n_jobs != 1).
    #
    # NOTE(review): because this script runs as __main__, the pickled
    # pipeline refers to the tokenizer as ``__main__.tokenize``.
    print("Initializing...")

    # Command line arguments: output path, then training data directory.
    save, training_directory = sys.argv[1], sys.argv[2]

    # Display progress logs on stdout
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )

    dataset = load_files(training_directory, shuffle=False)
    print("n_samples: %d" % len(dataset.data))

    # Hold out a quarter of the documents for testing.
    print("Splitting the dataset in training and test set...")
    docs_train, docs_test, y_train, y_test = train_test_split(
        dataset.data, dataset.target, test_size=0.25, random_state=None)

    # Stop words: one per line, whitespace stripped.
    print("Loading list of stop words...")
    with open('stopwords.txt', 'r') as f:
        words = [line.strip() for line in f]
    print("Stop words list loaded...")

    # Vectorizer / classifier pipeline that drops tokens that are too rare
    # or too frequent, and removes stop words.
    print("Setting up pipeline...")
    vectorizer = TfidfVectorizer(
        tokenizer=tokenize, stop_words=words, min_df=0.001, max_df=0.5,
        ngram_range=(1, 1))
    pipeline = Pipeline([
        ('vect', vectorizer),
        ('clf', LinearSVC(C=5000)),
    ])
    print("Pipeline:", [name for name, _ in pipeline.steps])

    # Grid search over the pipeline parameters. Adding entries here gives
    # better exploring power but increases processing time in a
    # combinatorial way, e.g.:
    #   'vect__ngram_range': [(1, 1), (1, 2)],
    #   'vect__min_df': (0.0005, 0.001),
    #   'vect__max_df': (0.25, 0.5),
    #   'clf__C': (10, 15, 20),
    print("Initializing grid search...")
    parameters = {}
    print("Parameters:")
    pprint.pprint(parameters)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=True)

    print("Training and performing grid search...\n")
    t0 = time()
    grid_search.fit(docs_train, y_train)
    print("\nDone in %0.3fs!\n" % (time() - t0))

    # Mean and std of the test score for every candidate that was explored.
    results = grid_search.cv_results_
    n_candidates = len(results['params'])
    for i in range(n_candidates):
        summary = (results['params'][i],
                   results['mean_test_score'][i],
                   results['std_test_score'][i])
        print(i, 'params - %s; mean - %0.2f; std - %0.2f' % summary)

    # Predict the held-out documents and keep the result in y_predicted.
    print("\nRunning against testing set...\n")
    y_predicted = grid_search.predict(docs_test)

    # Save model
    print("\nSaving model to", save, "...")
    joblib.dump(grid_search.best_estimator_, save)
    print("Model Saved! \nPrepare for some awesome stats!")
I must confess that I am pretty stumped, and after tinkering around, searching, and making sure that my server is configured correctly, I felt that perhaps someone here might be able to help.
Any help is appreciated, and if there is any more information that I need to provide, please let me know and I will be happy to.
Also, I am running:
python 3.5.3 with nltk and sklearn.
I solved this problem, although imperfectly, by removing my custom tokenizer and falling back on one of sklearn's.
However, I am still in the dark on how to integrate my own tokenizer.

Categories