Model Loader API

Draft Community Group Report

This version:
https://webmachinelearning.github.io/model-loader/
Issue Tracking:
GitHub
Editor:
Jonathan Bingham (Google Inc.)
Explainer:
explainer.md

Abstract

This document describes an API to load a custom pre-trained machine learning model.

Status of this document

This specification was published by the Web Machine Learning Community Group. It is not a W3C Standard nor is it on the W3C Standards Track. Please note that under the W3C Community Contributor License Agreement (CLA) there is a limited opt-out and other conditions apply. Learn more about W3C Community and Business Groups.

This incubation is on pause; see the discussion for the latest updates.

1. Introduction

For the introduction and use cases, please see explainer.md.

For illustration purposes, the API and examples use the TF Lite flatbuffer format.

2. API

enum MLModelFormat {
  // TensorFlow Lite flatbuffer.
  "tflite" 
};

enum MLDevicePreference {
  // Let the backend select the most suitable device.
  "auto",
  // The backend will use the GPU for model inference. If an operator is not
  // supported by the GPU, it will fall back to the CPU.
  "gpu",
  // The backend will use the CPU for model inference.
  "cpu"
};

enum MLPowerPreference {
  // Let the backend select the most suitable behavior.
  "auto",
  // Prioritizes execution speed over power consumption.
  "high-performance",
  // Prioritizes power consumption over other considerations such as execution
  // speed.
  "low-power",
};

dictionary MLContextOptions {
  // Preferred kind of device to use.
  MLDevicePreference devicePreference = "auto";

  // Preference as related to power consumption.
  MLPowerPreference powerPreference = "auto";

  // Model format for the model loader API.
  MLModelFormat modelFormat = "tflite";
  
  // Number of threads to use.
  // "0" means the backend can determine it automatically.
  unsigned long numThreads = 0;
};

[Exposed=Window]
interface ML {
  Promise<MLContext> createContext(optional MLContextOptions options = {});
};

enum MLDataType {
  // "Unknown" doesn’t mean "unsupported". The backend can support more types
  // than those explicitly listed here (e.g., TFLite has complex numbers).
  // We treat them as "unknown" to avoid exposing too many details of the
  // backends from the beginning.
  "unknown",
  "int64",
  "uint64",
  "float64",
  "int32",
  "uint32",
  "float32",
  "int16",
  "uint16",
  "float16",
  "int8",
  "uint8",
  "bool",
};

dictionary MLTensor {
  required ArrayBufferView data;
  required sequence<unsigned long> dimensions;
};

dictionary MLTensorInfo {
  required DOMString name;
  required MLDataType type;
  required sequence<unsigned long> dimensions;
};
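
For instance (non-normative), the following sketch shows one way a page might map MLDataType values to typed-array constructors when allocating MLTensor data from an MLTensorInfo. The helper names are hypothetical; the sketch assumes the backend accepts the corresponding typed arrays and omits "float16", "bool" and "unknown", which have no dedicated JavaScript typed array.

// Non-normative mapping from MLDataType to a typed-array constructor.
function typedArrayFor(dataType) {
  switch (dataType) {
    case "float64": return Float64Array;
    case "float32": return Float32Array;
    case "int64":   return BigInt64Array;
    case "uint64":  return BigUint64Array;
    case "int32":   return Int32Array;
    case "uint32":  return Uint32Array;
    case "int16":   return Int16Array;
    case "uint16":  return Uint16Array;
    case "int8":    return Int8Array;
    case "uint8":   return Uint8Array;
    default:
      throw new Error(`No typed-array mapping for MLDataType "${dataType}"`);
  }
}

// Allocate a zero-filled MLTensor matching an MLTensorInfo.
function emptyTensorFor(info) {
  const length = info.dimensions.reduce((a, b) => a * b, 1);
  const Ctor = typedArrayFor(info.type);
  return { data: new Ctor(length), dimensions: info.dimensions };
}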

[SecureContext, Exposed=Window]
interface MLModel {
  Promise<record<DOMString, MLTensor>> compute(record<DOMString, MLTensor> inputs);      
  sequence<MLTensorInfo> inputs();
  sequence<MLTensorInfo> outputs();
};

[Exposed=Window]
interface MLModelLoader {
  constructor(MLContext context);
  Promise<MLModel> load(ArrayBuffer modelBuffer);
};
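
As a non-normative illustration, a page might feature-detect the API and guard against load failures along the following lines. The function name and URL are placeholders, and the exact error behavior of load() is not specified here.

// Non-normative sketch: feature-detect the Model Loader API and handle a
// failed load.
async function tryLoadModel(url) {
  if (!("ml" in navigator)) {
    console.warn("Model Loader API is not available.");
    return null;
  }
  const context = await navigator.ml.createContext({ modelFormat: "tflite" });
  const loader = new MLModelLoader(context);
  try {
    const response = await fetch(url);
    const modelBuffer = await response.arrayBuffer();
    return await loader.load(modelBuffer);
  } catch (e) {
    // A fetch failure or an unsupported/corrupt model buffer ends up here.
    console.warn("Failed to load the model:", e);
    return null;
  }
}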

3. Examples

// First, create an MLContext. This is consistent with the WebNN API, and we
// add two new fields, "numThreads" and "modelFormat".
const context = await navigator.ml.createContext(
                                     { devicePreference: "cpu",
                                       powerPreference: "low-power",
                                       numThreads: 0,  // the default 0 means
                                                       // "decide automatically".
                                       modelFormat: "tflite" });
// Then create the model loader using the ML context.
const loader = new MLModelLoader(context);
// In the first version, we only support loading models from ArrayBuffers. We
// believe this covers most of the use cases. Web developers can download the
// model, e.g., with the fetch API. We can add new "load" functions in the
// future if they are really needed.
const modelUrl = 'https://path/to/model/file';
const modelBuffer = await fetch(modelUrl)
                            .then(response => response.arrayBuffer());
// Load the model.
const model = await loader.load(modelBuffer);
// Use the `model.compute` function to get the output of the model from some
// inputs. Example ways of using this function include:
// 1. When the model has only one input tensor, one can simply pass in the
// tensor without specifying its name (the user can still designate this
// input tensor by name if they like).
let z = await model.compute({ data: new Float32Array([10]),
                              dimensions: [1] });
// 2. When there are multiple input tensors, the user has to designate the
// input tensors by their names.
z = await model.compute({ x: { data: new Float32Array([10]), 
                               dimensions: [1] },
                          y: { data: new Float32Array([20]), 
                               dimensions: [1] } });
// 3. The client can also specify the output tensor. This is consistent with
// the WebNN API and can be useful, e.g., when the output tensor is a GPU
// buffer. In this case, the function will return an empty promise. The
// dimensions of the specified output tensor must match the dimensions of the
// model's output tensor.
const z_buffer = ml.tensor({ data: new Float64Array(1),
                             dimensions: [1] });
await model.compute({ data: new Float32Array([10]), 
                      dimensions: [1] },
                    z_buffer);
// For the output tensor(s):
// Similar to the input arguments, if there is only one output tensor, the
// `compute` function returns a tensor in cases 1 and 2, and there is no need
// to specify the name of the output tensor in case 3. But if there are
// multiple output tensors, the output in cases 1 and 2 will be a map from
// tensor names to tensors, and in case 3 the output argument must be a map
// from tensor names to tensors too.
// For cases 1 and 2, where the actual output data is located depends on the
// context: in a CPU context, the output tensors' buffers will be RAM
// buffer(s), and in a GPU context, the output tensors' buffers will be GPU
// buffer(s).
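// As a further, non-normative sketch, the `inputs()` and `outputs()` methods
// can be used to discover the tensor names, data types and dimensions that a
// loaded model expects. The tensor names used below ("x", "y" and "prob") are
// hypothetical; real names depend entirely on the model file.
for (const info of model.inputs()) {
  console.log(`input "${info.name}": ${info.type}, dims [${info.dimensions}]`);
}
for (const info of model.outputs()) {
  console.log(`output "${info.name}": ${info.type}, dims [${info.dimensions}]`);
}
// With multiple output tensors, the value that `compute` resolves with is a
// map from output tensor names to tensors, so each output is read by name.
const results = await model.compute({ x: { data: new Float32Array([10]),
                                           dimensions: [1] },
                                      y: { data: new Float32Array([20]),
                                           dimensions: [1] } });
const prob = results["prob"];  // hypothetical output tensor name
console.log(prob.dimensions, prob.data);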

Conformance

Document conventions

Conformance requirements are expressed with a combination of descriptive assertions and RFC 2119 terminology. The key words “MUST”, “MUST NOT”, “REQUIRED”, “SHALL”, “SHALL NOT”, “SHOULD”, “SHOULD NOT”, “RECOMMENDED”, “MAY”, and “OPTIONAL” in the normative parts of this document are to be interpreted as described in RFC 2119. However, for readability, these words do not appear in all uppercase letters in this specification.

All of the text of this specification is normative except sections explicitly marked as non-normative, examples, and notes. [RFC2119]

Examples in this specification are introduced with the words “for example” or are set apart from the normative text with class="example", like this:

This is an example of an informative example.

Informative notes begin with the word “Note” and are set apart from the normative text with class="note", like this:

Note, this is an informative note.

Index

Terms defined by this specification

Terms defined by reference

References

Normative References

[RFC2119]
S. Bradner. Key words for use in RFCs to Indicate Requirement Levels. March 1997. Best Current Practice. URL: https://datatracker.ietf.org/doc/html/rfc2119
[WEBIDL]
Edgar Chen; Timothy Gu. Web IDL Standard. Living Standard. URL: https://webidl.spec.whatwg.org/
[WEBNN]
Ningxin Hu; Chai Chaoweeraprasit. Web Neural Network API. URL: https://webmachinelearning.github.io/webnn/

IDL Index

enum MLModelFormat {
  // TensorFlow Lite flatbuffer.
  "tflite" 
};

enum MLDevicePreference {
  // Let the backend select the most suitable device.
  "auto",
  // The backend will use the GPU for model inference. If an operator is not
  // supported by the GPU, it will fall back to the CPU.
  "gpu",
  // The backend will use the CPU for model inference.
  "cpu"
};

enum MLPowerPreference {
  // Let the backend select the most suitable behavior.
  "auto",
  // Prioritizes execution speed over power consumption.
  "high-performance",
  // Prioritizes power consumption over other considerations such as execution
  // speed.
  "low-power",
};

dictionary MLContextOptions {
  // Preferred kind of device to use.
  MLDevicePreference devicePreference = "auto";

  // Preference as related to power consumption.
  MLPowerPreference powerPreference = "auto";

  // Model format for the model loader API.
  MLModelFormat modelFormat = "tflite";
  
  // Number of threads to use.
  // "0" means the backend can determine it automatically.
  unsigned long numThreads = 0;
};

[Exposed=Window]
interface ML {
  Promise<MLContext> createContext(optional MLContextOptions options = {});
};

enum MLDataType {
  // "Unknown" doesn’t mean "unsupported". The backend can support more types
  // than those explicitly listed here (e.g., TFLite has complex numbers).
  // We treat them as "unknown" to avoid exposing too many details of the
  // backends from the beginning.
  "unknown",
  "int64",
  "uint64",
  "float64",
  "int32",
  "uint32",
  "float32",
  "int16",
  "uint16",
  "float16",
  "int8",
  "uint8",
  "bool",
};

dictionary MLTensor {
  required ArrayBufferView data;
  required sequence<unsigned long> dimensions;
};

dictionary MLTensorInfo {
  required DOMString name;
  required MLDataType type;
  required sequence<unsigned long> dimensions;
};

[SecureContext, Exposed=Window]
interface MLModel {
  Promise<record<DOMString, MLTensor>> compute(record<DOMString, MLTensor> inputs);      
  sequence<MLTensorInfo> inputs();
  sequence<MLTensorInfo> outputs();
};

[Exposed=Window]
interface MLModelLoader {
  constructor(MLContext context);
  Promise<MLModel> load(ArrayBuffer modelBuffer);
};