Deploying Keras model to Google Cloud ML for serving predictions - deployment
I need to understand how to deploy models on Google Cloud ML. My first task is to deploy a very simple text classifier on the service. I do it in the following steps (could perhaps be shortened to fewer steps, if so, feel free to let me know):
Define the model using Keras and export to YAML
Load up YAML and export as a Tensorflow SavedModel
Upload model to Google Cloud Storage
Deploy model from storage to Google Cloud ML
Set the uploaded model version as default on the models website.
Run model with a sample input
I've finally made steps 1-5 work, but now I get the strange error shown below when running the model. Can anyone help? Details on the steps are below. Hopefully, they can also help others that are stuck on one of the previous steps. My model works fine locally.
I've seen Deploying Keras Models via Google Cloud ML and Export a basic Tensorflow model to Google Cloud ML, but they seem to be stuck on other steps of the process.
Error
Prediction failed: Exception during model execution: AbortionError(code=StatusCode.INVALID_ARGUMENT, details="In[0] is not a matrix
[[Node: MatMul = MatMul[T=DT_FLOAT, _output_shapes=[[-1,64]], transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/cpu:0"](Mean, softmax_W/read)]]")
Step 1
# import necessary classes from Keras..

# Integer token ids: each example is a vector of `maxlen` int32 values.
model_input = Input(shape=(maxlen,), dtype='int32')

# Token embedding layer producing 256-dimensional vectors (index 0 is not masked).
embed = Embedding(
    input_dim=nb_tokens,
    output_dim=256,
    mask_zero=False,
    input_length=maxlen,
    name='embedding',
)

# Embed the tokens, then pool the embeddings with a global average.
x = embed(model_input)
x = GlobalAveragePooling1D()(x)

# Single softmax head over `nb_classes`; the model takes its outputs as a list.
softmax_layer = Dense(nb_classes, activation='softmax', name='softmax')
outputs = [softmax_layer(x)]

model = Model(input=[model_input], output=outputs, name="fasttext")
# export to YAML..
Step 2
from __future__ import print_function
import sys
import os
import tensorflow as tf
from tensorflow.contrib.session_bundle import exporter
import keras
from keras import backend as K
from keras.models import model_from_config, model_from_yaml
from optparse import OptionParser
EXPORT_VERSION = 1 # for us to keep track of different model versions (integer)
def export_model(model_def, model_weights, export_path):
    """Rebuild a Keras model from YAML and export it as a TensorFlow SavedModel.

    Two signatures are written: a classification signature (registered under
    the default serving key) that returns the top-5 classes and scores, and a
    prediction signature named ``'predict_images'`` that maps the raw model
    input to the raw model output.

    Args:
        model_def: Path to the YAML file holding the Keras model definition.
        model_weights: Path to the trained weights. Currently unused: the
            ``load_weights`` call below is commented out, so the exported
            model carries whatever weights the freshly built model has.
        export_path: Base directory; the SavedModel is written to the
            ``initial`` sub-directory of it.

    Raises:
        SystemExit: Always, once the export has finished.
    """
    # Short aliases for the long SavedModel namespaces used below.
    sig_consts = tf.saved_model.signature_constants
    build_tensor_info = tf.saved_model.utils.build_tensor_info
    build_signature_def = tf.saved_model.signature_def_utils.build_signature_def

    with tf.Session() as sess:
        init_op = tf.global_variables_initializer()
        sess.run(init_op)

        K.set_learning_phase(0)  # all new operations will be in test mode from now on

        # Context manager: the file handle is closed even if reading fails.
        with open(model_def, 'r') as yaml_file:
            yaml_string = yaml_file.read()
        model = model_from_yaml(yaml_string)

        # force initialization
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam')
        Wsave = model.get_weights()
        model.set_weights(Wsave)

        # weights are not loaded as I'm just testing, not really deploying
        # model.load_weights(model_weights)

        print(model.input)
        print(model.output)

        export_path_base = export_path
        export_path = os.path.join(
            tf.compat.as_bytes(export_path_base),
            tf.compat.as_bytes('initial'))
        builder = tf.saved_model.builder.SavedModelBuilder(export_path)

        # Build the signature_def_map.
        x = model.input
        y = model.output

        # Top-5 scores and their class indices; indices are mapped to strings
        # through a 5-entry lookup table ("0".."4").
        # `range` (not `xrange`) keeps this working on both Python 2 and 3.
        values, indices = tf.nn.top_k(y, 5)
        table = tf.contrib.lookup.index_to_string_table_from_tensor(
            tf.constant([str(i) for i in range(5)]))
        prediction_classes = table.lookup(tf.to_int64(indices))

        classification_inputs = build_tensor_info(model.input)
        classification_outputs_classes = build_tensor_info(prediction_classes)
        classification_outputs_scores = build_tensor_info(values)
        classification_signature = build_signature_def(
            inputs={sig_consts.CLASSIFY_INPUTS: classification_inputs},
            outputs={
                sig_consts.CLASSIFY_OUTPUT_CLASSES: classification_outputs_classes,
                sig_consts.CLASSIFY_OUTPUT_SCORES: classification_outputs_scores,
            },
            method_name=sig_consts.CLASSIFY_METHOD_NAME)

        tensor_info_x = build_tensor_info(x)
        tensor_info_y = build_tensor_info(y)
        prediction_signature = build_signature_def(
            inputs={'images': tensor_info_x},
            outputs={'scores': tensor_info_y},
            method_name=sig_consts.PREDICT_METHOD_NAME)

        # The lookup table must be initialised when the model is loaded.
        legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')

        # NOTE(review): the classification signature is the default here and
        # the prediction signature is secondary -- confirm the serving
        # platform can address non-default signatures.
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'predict_images': prediction_signature,
                sig_consts.DEFAULT_SERVING_SIGNATURE_DEF_KEY: classification_signature,
            },
            legacy_init_op=legacy_init_op)
        builder.save()

    print('Done exporting!')
    # Kept from the original: terminate the script as soon as the export is done.
    raise SystemExit
if __name__ == '__main__':
    # Expected positional arguments: <model_def> <model_weights> <export_path>
    parser = OptionParser("usage: %prog [options] arg")
    (options, args) = parser.parse_args()
    if len(args) < 3:
        raise ValueError("Too few arguments!")
    # Only the first three positionals are used; any extras are ignored.
    model_def, model_weights, export_path = args[:3]
    export_model(model_def, model_weights, export_path)
Step 3
gsutil cp -r fasttext_cloud/ gs://quiet-notch-xyz.appspot.com
Step 4
from __future__ import print_function
from oauth2client.client import GoogleCredentials
from googleapiclient import discovery
from googleapiclient import errors
import time
# Deployment script: create the model resource, create a version of it from
# the SavedModel in Cloud Storage, then poll the long-running operation until
# the service reports that it has finished.
projectID = 'projects/{}'.format('quiet-notch-xyz')
modelName = 'fasttext'
modelID = '{}/models/{}'.format(projectID, modelName)
versionName = 'Initial'
versionDescription = 'Initial release.'
trainedModelLocation = 'gs://quiet-notch-xyz.appspot.com/fasttext/'

credentials = GoogleCredentials.get_application_default()
ml = discovery.build('ml', 'v1', credentials=credentials)

# Create a dictionary with the fields from the request body.
requestDict = {'name': modelName, 'description': 'Online predictions.'}

# Create a request to call projects.models.create.
request = ml.projects().models().create(parent=projectID, body=requestDict)

# Make the call.
try:
    response = request.execute()
except errors.HttpError as err:
    # Something went wrong, print out some information.
    print('There was an error creating the model.' +
          ' Check the details:')
    print(err._get_reason())
    # Clear the response for next time.
    response = None
    raise

# Pause for 10 seconds before creating the version.
time.sleep(10)

requestDict = {'name': versionName,
               'description': versionDescription,
               'deploymentUri': trainedModelLocation}

# Create a request to call projects.models.versions.create
request = ml.projects().models().versions().create(parent=modelID,
                                                   body=requestDict)

# Make the call.
try:
    print("Creating model setup..", end=' ')
    response = request.execute()
    # Get the operation name.
    operationID = response['name']
    print('Done.')
except errors.HttpError as err:
    # Something went wrong, print out some information.
    print('There was an error creating the version.' +
          ' Check the details:')
    print(err._get_reason())
    raise

done = False
request = ml.projects().operations().get(name=operationID)
print("Adding model from storage..", end=' ')
while not done:
    response = None
    # Wait 10 seconds between polls.
    time.sleep(10)
    # Make the next call.
    try:
        response = request.execute()
        # Keep polling until the service marks the operation as finished;
        # a single successful request does not mean the version is deployed.
        done = response.get('done', False)
    except errors.HttpError as err:
        # Something went wrong, print out some information.
        print('There was an error getting the operation.' +
              'Check the details:')
        print(err._get_reason())
        done = True
        raise

# A finished operation can still have failed; surface that instead of
# reporting success.
if response.get('error'):
    raise RuntimeError('Version creation failed: {}'.format(response['error']))
print("Done.")
Step 5
Use website.
Step 6
def predict_json(instances, project='quiet-notch-xyz', model='fasttext', version=None):
    """Request online predictions from a deployed Cloud ML Engine model.

    Args:
        instances ([Mapping[str: Any]]): Keys should be the names of Tensors
            your deployed model expects as inputs. Values should be datatypes
            convertible to Tensors, or (potentially nested) lists of datatypes
            convertible to tensors.
        project (str): project where the Cloud ML Engine Model is deployed.
        model (str): model name.
        version: str, version of the model to target. When ``None`` no
            version is added to the resource name.

    Returns:
        Mapping[str: any]: dictionary of prediction results defined by the
            model.

    Raises:
        RuntimeError: if the response contains an ``'error'`` entry.
    """
    # Build the ML Engine service object. To authenticate, set the environment
    # variable GOOGLE_APPLICATION_CREDENTIALS=<path_to_service_account_file>
    service = googleapiclient.discovery.build('ml', 'v1')

    # Resource name: the model, optionally narrowed to one version.
    version_suffix = '' if version is None else '/versions/{}'.format(version)
    resource_name = 'projects/{}/models/{}'.format(project, model) + version_suffix

    predict_request = service.projects().predict(
        name=resource_name,
        body={'instances': instances},
    )
    response = predict_request.execute()

    if 'error' not in response:
        return response['predictions']
    raise RuntimeError(response['error'])
Then run function with test input: predict_json({'inputs':[[18, 87, 13, 589, 0]]})
There is now a sample demonstrating the use of Keras on CloudML engine, including prediction. You can find the sample here:
https://github.com/GoogleCloudPlatform/cloudml-samples/tree/master/census/keras
I would suggest comparing your code to that code.
Some additional suggestions that will still be relevant:
CloudML Engine currently only supports using a single signature (the default signature). Looking at your code, I think prediction_signature is more likely to lead to success, but you haven't made that the default signature. I suggest the following:
# Export only the prediction signature, registered under the default serving key.
default_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
builder.add_meta_graph_and_variables(
    sess,
    [tf.saved_model.tag_constants.SERVING],
    signature_def_map={default_key: prediction_signature},
    legacy_init_op=legacy_init_op)
If you are deploying to the service, then you would invoke prediction like so:
predict_json({'images':[[18, 87, 13, 589, 0]]})
If you are testing locally using gcloud ml-engine local predict --json-instances the input data is slightly different (matches that of the batch prediction service). Each newline-separated line looks like this (showing a file with two lines):
{"images": [[18, 87, 13, 589, 0]]}
{"images": [[21, 85, 13, 100, 1]]}
I don't actually know enough about the shape of model.x to ensure the data being sent is correct for your model.
By way of explanation, it may be insightful to consider the difference between the Classification and Prediction methods in SavedModel. One difference is that, when using tensorflow_serving, which is based on gRPC, which is strongly typed, Classification provides a strongly-typed signature that most classifiers can use. Then you can reuse the same client on any classifier.
That's not overly useful when using JSON since JSON isn't strongly typed.
One other difference is that, when using tensorflow_serving, Prediction accepts column-based inputs (a map from feature name to every value for that feature in the whole batch) whereas Classification accepts row based inputs (each input instance/example is a row).
CloudML abstracts that away a bit and always requires row-based inputs (a list of instances). Even though we only officially support Prediction, Classification should work as well.
Related
how to use a pytest function to test different site using a different set of test data for each site such as staging/production
I have a set of pytest functions to test APIs, and test data is in a json file loaded by the pytest.mark.parametrize. Because the staging, production, and pre_production have different data but are similar, I want to save the test data in a different folder and use the same file name, in order to keep the python function clean. Site information is a new option from the command line of pytest. It doesn't work, pytest.mark.parametrize can't get the right folder to collect the test data. This is in the conftest.py #pytest.fixture(autouse=True) def setup(request, site): request.cls.site = site yield def pytest_addoption(parser): parser.addoption("--site", action="store", default="staging") #pytest.fixture(scope="session", autouse=True) def site(request): return request.config.getoption("--site") This is in the test cases file: #pytest.mark.usefixtures("setup") class TestAAA: #pytest.fixture(autouse=True) def class_setup(self): self.endpoint = read_data_from_file("endpoint.json")["AAA"][self.site] if self.site == "production": self.test_data_folder = "SourcesV2/production/" else: // staging self.test_data_folder = "SourcesV2/" testdata.set_data_folder(self.test_data_folder) #pytest.mark.parametrize("test_data", testdata.read_data_from_json_file(r"get_source_information.json")) def test_get_source_information(self, test_data): request_url = self.endpoint + f"/AAA/sources/{test_data['sourceID']}" response = requests.get(request_url) print(response) I can use pytest.skip to skip the test data which is not for the current site. if test_data["site"] != self.site: pytest.skip("this test case is for " + test_data["site"] + ", skiping...") But it will need to put all the test data in one file for staging/production/pre-production, and there will be a lot of skipped tests in the report, which is not my favorite. Do you have any idea to solve this? How to pass a different file name to the parametrize according to the site? 
Or, at least, how to let the skipped test not write logs in the report? Thanks
The parametrize decorator is evaluated at load time, not at run time, so you will not be able to use it directly for this. You need to do the parametrization at runtime instead. This can be done using the pytest_generate_tests hook: def pytest_generate_tests(metafunc): if "test_data" in metafunc.fixturenames: site = metafunc.config.getoption("--site") if site == "production": test_data_folder = "SourcesV2/production" else: test_data_folder = "SourcesV2" # this is just for illustration, your test data may be loaded differently with open(os.path.join(test_data_folder, "test_data.json")) as f: test_data = json.load(f) metafunc.parametrize("test_data", test_data) class TestAAA: def test_get_source_information(self, test_data): ... If loading the test data is expensive, you could also cache it to avoid reading it for each test.
Running blenderbot-3B model locally does not provide same result as on Inference API
I tried the facebook/blenderbot-3B model using the Hosted Inference API and it works pretty well (https://huggingface.co/facebook/blenderbot-3B). Now I tried to use it locally with the Python script shown below. The created responses are much worse than from the inference API and do not make sense most of the time. Is a different code used for the inference API or did I make a mistake? from transformers import TFAutoModelForCausalLM, AutoTokenizer, BlenderbotTokenizer, TFBlenderbotForConditionalGeneration, TFT5ForConditionalGeneration, BlenderbotTokenizer, BlenderbotForConditionalGeneration import tensorflow as tf import torch device = "cuda:0" if torch.cuda.is_available() else "cpu" chat_bots = { 'BlenderBot': [BlenderbotTokenizer.from_pretrained("hyunwoongko/blenderbot-9B"), BlenderbotForConditionalGeneration.from_pretrained("hyunwoongko/blenderbot-9B").to(device)], } key = 'BlenderBot' tokenizer, model = chat_bots[key] for step in range(100): new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt').to(device) if step > 0: bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) else: bot_input_ids = new_user_input_ids chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id).to(device) print("Bot: ", tokenizer.batch_decode(chat_history_ids, skip_special_tokens=True)[0])
Dash app connections to AWS postgres DB VERY SLOW
I've created a live-updating dash app connected to a public facing AWS Postgres database. I've put db connection within my callback so it updates, but I find that it takes a long long time to retrieve data and create the graph, such that if the interval time is reduced to 10 seconds or less, no graph loads at all. I've tried to store the data in dcc.store but the initial load still takes a very long time. My abbreviated code is written below. I'm assuming the lag time is from the engine connecting to the database, because I am only reading a few rows and columns. Is there anyway to speed this up? import plotly.graph_objs as go import dash import dash_core_components as dcc import dash_html_components as html from dash.dependencies import Input, Output, State from plotly.subplots import make_subplots from sqlalchemy import create_engine, MetaData, Table from sqlalchemy.ext.automap import automap_base from sqlalchemy.orm import declarative_base from sqlalchemy import Column, Integer, String, func, Date, ARRAY from sqlalchemy.orm import sessionmaker app = dash.Dash(__name__, external_stylesheets=[BS], suppress_callback_exceptions=True, update_title=None) server=app.server app.layout = html.Div([ dcc.Store(id='time', storage_type='session'), dcc.Store(id='blood_pressure', storage_type='session'), html.Div(dcc.Graph(id='live-graph', animate=False), className='w-100'), html.Div(id= "testing"), dcc.Interval( id='graph-update-BP', interval=30000, n_intervals=0 )]), width={"size": 10, "offset": 0.5}), #app.callback( dash.dependencies.Output('live-graph', 'figure'), dash.dependencies.Output('blood_pressure', 'data'), dash.dependencies.Output('time', 'data'), [dash.dependencies.Input('graph-update-BP', 'n_intervals')], Input('live-graph', 'relayoutData'), ) def update_graph_scatter_1(n): trace = [] blood_pressure = [] time = [] engine = create_engine("postgresql://username:password#address:5432/xxxxx", echo=True, future=True) Session = sessionmaker(bind=engine) session = 
Session() Base = automap_base() Base.prepare(engine, reflect=True) User = Base.classes.users Datex = Base.classes.data for instance in session.query(Datex).filter(Datex.user_id == 3).filter(Datex.date_time == 'Monday,Apr:26'): blood_pressure.append([instance.systolic, instance.mean, instance.diastolic]) time.append(instance.time) for i in range(0, len(blood_pressure)): trace.append(go.Box(y=blood_pressure[i], x=time[i], line=dict(color='#6a92ff'), hoverinfo='all')) fig = make_subplots(rows=1, cols=1) def append_trace(): for i in range(0, len(trace)): fig.append_trace(trace[i], 1, 1) append_trace() return fig, blood_pressure, hr,
You can increase performance in your app in the following ways: Non-programming methods: If your app is deployed on AWS, ensure your app is connecting to your database over private IP. This reduces the number of networks your data has to traverse and will result in significantly lower latency. Ensure your virtual machine has enough RAM. (If you're loading 2GB of data to a machine with 1GB available RAM, you're going to see the IO hit disk before loading to your program.) Programming methods: Modularize connecting to your database, and only do it once. This decreases the overhead required to reserve resources and authenticate connecting to the database import os class DbConnection: """Use this class to connect to your database within a dashapp""" def __init__(self, **kwargs): self.DB_URI = os.environ.get('DB_URI', kwargs.get('DB_URI')) self.echo = kwargs.get('echo', True) self.future = kwargs.get('future', True) # Now create the engine self.engine = create_engine(self.DB_URI, echo=self.echo, future=self.self) # Make the session maker self.session_maker = sessionmaker(bind=self.engine) #property def session(self): """Return a session as a property""" return self.session_maker() # ------------------------------------------- # In your app, instantiate the database connection # and map your base my_db_connection = DbConnection() # provide kwargs as needed session = my_db_connection.session # necessary to assign property to a variable # Map the classes Base = automap_base() Base.prepare(my_db_connection.engine, reflect=True) User = Base.classes.users Datex = Base.classes.data Cache frequently queried data. Unless your data is massive and dramatically varying, you should expect better performance from loading the data from disk (or RAM) on your machine, than over the network from your database. 
from functools import lru_cache #lru_cache() def get_blood_pressure(session, user_id, date): """returns blood pressure for a given user for a given date""" blood_pressure, time = [], [] query = session.query(Datex)\ .filter(Datex.user_id == 3)\ .filter(Datex.date_time == 'Monday,Apr:26') # I like short variable names when interacting with db results for rec in query: time.append(rec.time) blood_pressure.append([rec.systolic, rec.mean, rec.diastolic]) # finally return blood_pressure, time Putting them all together, your callback should be a lot quicker def update_graph_scatter_1(n): # I'm not sure how these variables will be assigned # but you'll figure it out blood_pressure, time = get_blood_pressure(session=session, user_id=user_id, date='Monday,Apr:26') # Create new traces for i in range(0, len(blood_pressure)): trace.append(go.Box( y=blood_pressure[i], x=time[i], line=dict(color='#6a92ff'), hoverinfo='all' )) # Add to subplots fig = make_subplots(rows=1, cols=1) for i in range(0, len(trace)): fig.append_trace(trace[i], 1, 1) return fig, blood_pressure, time Lastly, it looks like you're recreating your graph objects each update. This is a heavy operation. I'd recommend updating the graph's data instead. I know this is possible, since I've done this in the past. But it looks like the solution is not-trivial, unfortunately. Perhaps an item for a later response or follow up Q. Further reading: https://dash.plotly.com/performance
Gcloud ai-platform, can't create model with own prediction-class
I try following AI Platform tutorial to upload a model and a prediction routine but one part fail and I don't understand why. My prediction class is the same as in their tutorial: %%writefile predictor.py import os import pickle import numpy as np from sklearn.datasets import load_iris from sklearn.externals import joblib class MyPredictor(object): def __init__(self, model, preprocessor): self._model = model self._preprocessor = preprocessor self._class_names = load_iris().target_names def predict(self, instances, **kwargs): inputs = np.asarray(instances) preprocessed_inputs = self._preprocessor.preprocess(inputs) if kwargs.get('probabilities'): probabilities = self._model.predict_proba(preprocessed_inputs) return probabilities.tolist() else: outputs = self._model.predict(preprocessed_inputs) return [self._class_names[class_num] for class_num in outputs] #classmethod def from_path(cls, model_dir): model_path = os.path.join(model_dir, 'model.joblib') model = joblib.load(model_path) preprocessor_path = os.path.join(model_dir, 'preprocessor.pkl') with open(preprocessor_path, 'rb') as f: preprocessor = pickle.load(f) return cls(model, preprocessor) the code I use to create my model in cloud is: ! gcloud beta ai-platform versions create {VERSION_NAME} \ --model {MODEL_NAME} \ --runtime-version 1.13 \ --python-version 3.5 \ --origin gs://{BUCKET_NAME}/custom_prediction_routine_tutorial/model/ \ --package-uris gs://{BUCKET_NAME}/custom_prediction_routine_tutorial/my_custom_code-0.1.tar.gz \ --prediction-class predictor.MyPredictor But I end up with such an odd error: ERROR: (gcloud.beta.ai-platform.versions.create) Bad model detected with error: "Failed to load model: Unexpected error when loading the model: 'ascii' codec can't decode byte 0xf9 in position 2: ordinal not in range(128) (Error code: 0)" The thing is that when I run the same command without the: --prediction-class predictor.MyPredictor it work fine. Does someone know the reason of this ? 
I think model.joblib might have an encoding problem but when I load it myself there is nothing wrong
I've found the solution. In the tutorial they use pickle to save the preprocessor object and Joblib to save the model. You need to use Joblib to save both and then send them to Google Cloud Storage.
Showing test count in buildbot
I am not particularly happy about the stats that Buildbot provides. I understand that it is for building and not testing - that's why it has a concept of Steps, but no concept of Test. Still there are many cases when you need test statistics from build results. For example when comparing skipped and failed tests on different platforms to estimate the impact of a change. So, what is needed to make Buildbot display test count in results? What is the simplest way, so that a person who doesn't know anything about Buildbot can do this in 15 minutes?
Depending how you want to process the test results and how the test results are presented, Buildbot does provide a Test step, buildbot.steps.shell.Test An example of how I use it for my build environment: from buildbot.steps import shell class CustomStepResult(shell.Test): description = 'Analyzing results' descriptionDone = 'Results analyzed' def __init__(self, log_file = None, *args, **kwargs): self._log_file = log_file shell.Test.__init__(self, *args, **kwargs) self.addFactoryArguments(log_file = log_file) def start(self): if not os.path.exists(self._log_file): self.finished(results.FAILURE) self.step_status.setText('TestResult XML file not found !') else: import xml.etree.ElementTree as etree tree = etree.parse(self._log_file) root = tree.getroot() passing = len(root.findall('./testsuite/testcase/success')) skipped = len(root.findall('./testsuite/testcase/skip')) fails = len(root.findall('./testsuite/error')) + len(root.findall('./testsuite/testcase/error')) + len(root.findall('./testsuite/testcase/failure')) self.setTestResults(total = fails+passing+skipped, failed = fails, passed = passing) ## the final status for WARNINGS is green but the step itself will be orange self.finished(results.SUCCESS if fails == 0 else results.WARNINGS) self.step_status.setText(self.describe(True)) And in the configuration factory I create a step as below: factory.addStep(CustomStepResult(log_file = log_file)) Basically I override the default Test shell step and pass a custom XML file which contains my test results. I then look for the pass/fail/skip result nodes and accordingly display the results in the waterfall.