// Copyright 2022 The MediaPipe Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. namespace tflite; // TFLite metadata contains both human readable and machine readable information // about what the model does and how to use the model. It can be used as a // README file, which elaborates the details of the model, each input/ouput // tensor, and each associated file. // // An important use case of TFLite metadata is the TFLite codegen tool, which // automatically generates the model interface based on the properties of the // model and the tensors. The model interface provides high-level APIs to // interact with the model, such as preprocessing the input data and running // inferences. // // Entries marked with "" are used in TFLite codegen tool to // generate the model interface. It is recommended to fill in at least those // enties to boost the codegen performance. // The Metadata schema is versioned by the Semantic versioning number, such as // MAJOR.MINOR.PATCH. It tracks the schema changes according to the rules below: // * Bump up the MAJOR number when making potentially backwards incompatible // changes. It must be incremented if the new changes break the backwards // compatibility. It may also include minor and patch level changes as // needed. The true backwards compatibility is indicated by the file // identifier. // * Bump up the MINOR number when making backwards compatible updates for // major features, such as supporting new content types or adding new // processing units. // * Bump up the PATCH number when making small backwards compatible changes, // such as adding a new fields or deprecating certain fields (not deleting // them). // // ModelMetadata.min_parser_version indicates the minimum necessary metadata // parser version to fully understand all fields in a given metadata flatbuffer. // // New fields and types will have associated comments with the schema version // for which they were added. // // Schema Semantic version: 1.5.0 // This indicates the flatbuffer compatibility. The number will bump up when a // break change is applied to the schema, such as removing fields or adding new // fields to the middle of a table. file_identifier "M001"; // History: // 1.0.1 - Added VOCABULARY type to AssociatedFileType. // 1.1.0 - Added BertTokenizerOptions to ProcessUnitOptions. // Added SentencePieceTokenizerOptions to ProcessUnitOptions. // Added input_process_units to SubGraphMetadata. // Added output_process_units to SubGraphMetadata. // 1.2.0 - Added input_tensor_group to SubGraphMetadata. // Added output_tensor_group to SubGraphMetadata. // 1.2.1 - Added RegexTokenizerOptions to ProcessUnitOptions. // 1.3.0 - Added AudioProperties to ContentProperties. // 1.4.0 - Added SCANN_INDEX_FILE type to AssociatedFileType. // 1.4.1 - Added version to AssociatedFile. // 1.5.0 - Added CustomMetadata in SubGraphMetadata. // File extension of any written files. file_extension "tflitemeta"; enum AssociatedFileType : byte { UNKNOWN = 0, // Files such as readme.txt. DESCRIPTIONS = 1, // Contains a list of labels (characters separated by "\n" or in lines) that // annotate certain axis of the tensor. For example, // the label file in image classification. Those labels annotate the // the output tensor, such that each value in the output tensor is the // probability of that corresponding category specified by the label. See the // example label file used in image classification [1]. // // : // If an output tensor has an associated file as TENSOR_AXIS_LABELS, return // the output as a mapping between the labels and probability in the model // interface. // If multiple files of the same type are present, the first one is used by // default; additional ones are to be distinguished from one another by their // specified locale. // // TODO: Add github example link. TENSOR_AXIS_LABELS = 2, // Contains a list of labels (characters separated by "\n" or in lines) that // tensor values correspond to. For example, in // the object detection model, one of the output tensors is the detected // classes. And each value in the tensor refers to the index of label in the // category label file. See the example label file used in object detection // [1]. // // : // If an output tensor has an associated file as TENSOR_VALUE_LABELS, convert // the tensor values into labels, and return a list of string as the output. // If multiple files of the same type are present, the first one is used by // default; additional ones are to be distinguished from one another by their // specified locale. // // TODO: Add github example link. TENSOR_VALUE_LABELS = 3, // Contains sigmoid-based score calibration parameters, formatted as CSV. // Lines contain for each index of an output tensor the scale, slope, offset // and (optional) min_score parameters to be used for sigmoid fitting (in this // order and in `strtof`-compatible [1] format). Scale should be a // non-negative value. // A line may be left empty to default calibrated scores for this index to // default_score. // In summary, each line should thus contain 0, 3 or 4 comma-separated values. // // See the example score calibration file used in image classification [2]. // // See documentation for ScoreCalibrationOptions for details. // // [1]: https://en.cppreference.com/w/c/string/byte/strtof // TODO: Add github example link. TENSOR_AXIS_SCORE_CALIBRATION = 4, // Contains a list of unique words (characters separated by "\n" or in lines) // that help to convert natural language words to embedding vectors. // // See the example vocab file used in text classification [1]. // // TODO: Add github example link. // Added in: 1.0.1 VOCABULARY = 5, // TODO: introduce the ScaNN index file with links once the code // is released. // Contains on-device ScaNN index file with LevelDB format. // Added in: 1.4.0 SCANN_INDEX_FILE = 6, } table AssociatedFile { // Name of this file. Need to be exact the same as the name of the actual file // packed into the TFLite model as a zip file. // // : // Locates to the actual file in the TFLite model. name:string; // A description of what the file is. description:string; // Type of the associated file. There may be special pre/post processing for // some types. For example in image classification, a label file of the output // will be used to convert object index into string. // // : // Determines how to process the corresponding tensor. type:AssociatedFileType; // An optional locale for this associated file (if applicable). It is // recommended to use an ISO 639-1 letter code (e.g. "en" for English), // optionally completed by a two letter region code (e.g. "en-US" for US // English and "en-CA" for Canadian English). // Leverage this in order to specify e.g multiple label files translated in // different languages. locale:string; // Version of the file specified by model creators. // Added in: 1.4.1 version:string; } // The basic content type for all tensors. // // : // Input feature tensors: // 1. Generates the method to load data from a TensorBuffer. // 2. Creates the preprocessing logic. The default processing pipeline is: // [NormalizeOp, QuantizeOp]. // Output feature tensors: // 1. Generates the method to return the output data to a TensorBuffer. // 2. Creates the post-processing logic. The default processing pipeline is: // [DeQuantizeOp]. table FeatureProperties { } // The type of color space of an image. enum ColorSpaceType : byte { UNKNOWN = 0, RGB = 1, GRAYSCALE = 2, } table ImageSize { width:uint; height:uint; } // The properties for image tensors. // // : // Input image tensors: // 1. Generates the method to load an image from a TensorImage. // 2. Creates the preprocessing logic. The default processing pipeline is: // [ResizeOp, NormalizeOp, QuantizeOp]. // Output image tensors: // 1. Generates the method to return the output data to a TensorImage. // 2. Creates the post-processing logic. The default processing pipeline is: // [DeQuantizeOp]. table ImageProperties { // The color space of the image. // // : // Determines how to convert the color space of a given image from users. color_space:ColorSpaceType; // Indicates the default value of image width and height if the tensor shape // is dynamic. For fixed-size tensor, this size will be consistent with the // expected size. default_size:ImageSize; } // The properties for tensors representing bounding boxes. // // : // Input image tensors: NA. // Output image tensors: parses the values into a data structure that represents // bounding boxes. For example, in the generated wrapper for Android, it returns // the output as android.graphics.Rect objects. enum BoundingBoxType : byte { UNKNOWN = 0, // Represents the bounding box by using the combination of boundaries, // {left, top, right, bottom}. // The default order is {left, top, right, bottom}. Other orders can be // indicated by BoundingBoxProperties.index. BOUNDARIES = 1, // Represents the bounding box by using the upper_left corner, width and // height. // The default order is {upper_left_x, upper_left_y, width, height}. Other // orders can be indicated by BoundingBoxProperties.index. UPPER_LEFT = 2, // Represents the bounding box by using the center of the box, width and // height. The default order is {center_x, center_y, width, height}. Other // orders can be indicated by BoundingBoxProperties.index. CENTER = 3, } // The properties for audio tensors. // Added in: 1.3.0 table AudioProperties { // The sample rate in Hz when the audio was captured. sample_rate:uint; // The channel count of the audio. channels:uint; } enum CoordinateType : byte { // The coordinates are float values from 0 to 1. RATIO = 0, // The coordinates are integers. PIXEL = 1, } table BoundingBoxProperties { // Denotes the order of the elements defined in each bounding box type. An // empty index array represent the default order of each bounding box type. // For example, to denote the default order of BOUNDARIES, {left, top, right, // bottom}, the index should be {0, 1, 2, 3}. To denote the order {left, // right, top, bottom}, the order should be {0, 2, 1, 3}. // // The index array can be applied to all bounding box types to adjust the // order of their corresponding underlying elements. // // : // Indicates how to parse the bounding box values. index:[uint]; // : // Indicates how to parse the bounding box values. type:BoundingBoxType; // : // Indicates how to convert the bounding box back to the original image in // pixels. coordinate_type:CoordinateType; } union ContentProperties { FeatureProperties, ImageProperties, BoundingBoxProperties, // Added in: 1.3.0 AudioProperties, } table ValueRange { min:int; max:int; } table Content { // The properties that the content may have, indicating the type of the // Content. // // : // Indicates how to process the tensor. content_properties:ContentProperties; // The range of dimensions that the content corresponds to. A NULL // "range" indicates that the content uses up all dimensions, // except the batch axis if applied. // // Here are all the possible situations of how a tensor is composed. // Case 1: The tensor is a single object, such as an image. // For example, the input of an image classifier // (https://www.tensorflow.org/lite/models/image_classification/overview), // a tensor of shape [1, 224, 224, 3]. Dimensions 1 to 3 correspond to the // image. Since dimension 0 is a batch axis, which can be ignored, // "range" can be left as NULL. // // Case 2: The tensor contains multiple instances of the same object. // For example, the output tensor of detected bounding boxes of an object // detection model // (https://www.tensorflow.org/lite/models/object_detection/overview). // The tensor shape is [1, 10, 4]. Here is the what the three dimensions // represent for: // dimension 0: the batch axis. // dimension 1: the 10 objects detected with the highest confidence. // dimension 2: the bounding boxes of the 10 detected objects. // The tensor is essentially 10 bounding boxes. In this case, // "range" should be {min=2; max=2;}. // // The output tensor of scores of the above object detection model has shape // [1, 10], where // dimension 0: the batch axis; // dimension 1: the scores of the 10 detected objects. // Set "range" to the number of dimensions which is {min=2; max=2;} to denote // that every element in the tensor is an individual content object, i.e. a // score in this example. // // Another example is the pose estimation model // (https://www.tensorflow.org/lite/models/pose_estimation/overview). // The output tensor of heatmaps is in the shape of [1, 9, 9, 17]. // Here is the what the four dimensions represent for: // dimension 0: the batch axis. // dimension 1/2: the heatmap image. // dimension 3: 17 body parts of a person. // Even though the last axis is body part, the real content of this tensor is // the heatmap. "range" should be [min=1; max=2]. // // Case 3: The tensor contains multiple different objects. (Not supported by // Content at this point). // Sometimes a tensor may contain multiple different objects, thus different // contents. It is very common for regression models. For example, a model // to predict the fuel efficiency // (https://www.tensorflow.org/tutorials/keras/regression). // The input tensor has shape [1, 9], consisting of 9 features, such as // "Cylinders", "Displacement", "Weight", etc. In this case, dimension 1 // contains 9 different contents. However, since these sub-dimension objects // barely need to be specifically processed, their contents are not recorded // in the metadata. Through, the name of each dimension can be set through // TensorMetadata.dimension_names. // // Note that if it is not case 3, a tensor can only have one content type. // // : // Case 1: return a processed single object of certain content type. // Case 2: return a list of processed objects of certain content type. The // generated model interface have API to random access those objects from // the output. range:ValueRange; } // Parameters that are used when normalizing the tensor. table NormalizationOptions{ // mean and std are normalization parameters. Tensor values are normalized // on a per-channel basis, by the formula // (x - mean) / std. // If there is only one value in mean or std, we'll propagate the value to // all channels. // // Quantized models share the same normalization parameters as their // corresponding float models. For example, an image input tensor may have // the normalization parameter of // mean = 127.5f and std = 127.5f. // The image value will be normalized from [0, 255] to [-1, 1]. // Then, for quantized models, the image data should be further quantized // according to the quantization parameters. In the case of uint8, the image // data will be scaled back to [0, 255], while for int8, the image data will // be scaled to [-128, 127]. // // Both the normalization parameters and quantization parameters can be // retrieved through the metadata extractor library. // TODO: add link for the metadata extractor library. // Per-channel mean of the possible values used in normalization. // // : // Apply normalization to input tensors accordingly. mean:[float]; // Per-channel standard dev. of the possible values used in normalization. // // : // Apply normalization to input tensors accordingly. std:[float]; } // The different possible score transforms to apply to uncalibrated scores // before applying score calibration. enum ScoreTransformationType : byte { // Identity function: g(x) = x. IDENTITY = 0, // Log function: g(x) = log(x). LOG = 1, // Inverse logistic function: g(x) = log(x) - log(1-x). INVERSE_LOGISTIC = 2, } // Options to perform score calibration on an output tensor through sigmoid // functions. One of the main purposes of score calibration is to make scores // across classes comparable, so that a common threshold can be used for all // output classes. This is meant for models producing class predictions as // output, e.g. image classification or detection models. // // For each index in the output tensor, this applies: // * `f(x) = scale / (1 + e^-(slope*g(x)+offset))` if `x > min_score` or if no // `min_score` has been specified, // * `f(x) = default_score` otherwise or if no scale, slope and offset have been // specified. // Where: // * scale, slope, offset and (optional) min_score are index-specific parameters // * g(x) is an index-independent transform among those defined in // ScoreTransformationType // * default_score is an index-independent parameter. // An AssociatedFile with type TANSOR_AXIS_SCORE_CALIBRATION specifying the // index-specific parameters must be associated with the corresponding // TensorMetadata for score calibration be applied. // // See the example score calibration file used in image classification [1]. // TODO: Add github example link. table ScoreCalibrationOptions { // The function to use for transforming the uncalibrated score before // applying score calibration. score_transformation:ScoreTransformationType; // The default calibrated score to apply if the uncalibrated score is // below min_score or if no parameters were specified for a given index. default_score:float; } // Performs thresholding on output tensor values, in order to filter out // low-confidence results. table ScoreThresholdingOptions { // The recommended global threshold below which results are considered // low-confidence and should be filtered out. global_score_threshold:float; } // Performs Bert tokenization as in tf.text.BertTokenizer // (https://github.com/tensorflow/text/blob/3599f6fcd2b780a2dc413b90fb9315464f10b314/docs/api_docs/python/text/BertTokenizer.md) // Added in: 1.1.0 table BertTokenizerOptions { // The vocabulary files used in the BertTokenizer. vocab_file:[AssociatedFile]; } // Performs SentencePiece tokenization as in tf.text.SentencepieceTokenizer // (https://github.com/tensorflow/text/blob/3599f6fcd2b780a2dc413b90fb9315464f10b314/docs/api_docs/python/text/SentencepieceTokenizer.md). // Added in: 1.1.0 table SentencePieceTokenizerOptions { // The SentencePiece model files used in the SentencePieceTokenizer. sentencePiece_model:[AssociatedFile]; // The optional vocabulary model files used in the SentencePieceTokenizer. vocab_file:[AssociatedFile]; } // Splits strings by the occurrences of delim_regex_pattern and converts the // tokens into ids. For example, given // delim_regex_pattern: "\W+", // string: "Words, words, words.", // the tokens after split are: "Words", "words", "words", "". // And then the tokens can be converted into ids according to the vocab_file. // Added in: 1.2.1 table RegexTokenizerOptions { delim_regex_pattern:string; // The vocabulary files used to convert this tokens into ids. vocab_file:[AssociatedFile]; } // Options that are used when processing the tensor. union ProcessUnitOptions { NormalizationOptions, ScoreCalibrationOptions, ScoreThresholdingOptions, // Added in: 1.1.0 BertTokenizerOptions, // Added in: 1.1.0 SentencePieceTokenizerOptions, // Added in: 1.2.1 RegexTokenizerOptions } // A process unit that is used to process the tensor out-of-graph. table ProcessUnit { options:ProcessUnitOptions; } // Statistics to describe a tensor. table Stats { // Max and min are not currently used in tflite.support codegen. They mainly // serve as references for users to better understand the model. They can also // be used to validate model pre/post processing results. // If there is only one value in max or min, we'll propagate the value to // all channels. // Per-channel maximum value of the tensor. max:[float]; // Per-channel minimum value of the tensor. min:[float]; } // Metadata of a group of tensors. It may contain several tensors that will be // grouped together in codegen. For example, the TFLite object detection model // example (https://www.tensorflow.org/lite/models/object_detection/overview) // has four outputs: classes, scores, bounding boxes, and number of detections. // If the four outputs are bundled together using TensorGroup (for example, // named as "detection result"), the codegen tool will generate the class, // `DetectionResult`, which contains the class, score, and bounding box. And the // outputs of the model will be converted to a list of `DetectionResults` and // the number of detection. Note that the number of detection is a single // number, therefore is inappropriate for the list of `DetectionResult`. // Added in: 1.2.0 table TensorGroup { // Name of tensor group. // // : // Name of the joint class of the tensor group. name:string; // Names of the tensors to group together, corresponding to // TensorMetadata.name. // // : // Determines which tensors will be added to this group. All tensors in the // group should have the same number of elements specified by Content.range. tensor_names:[string]; } // Detailed information of an input or output tensor. table TensorMetadata { // Name of the tensor. // // : // The name of this tensor in the generated model interface. name:string; // A description of the tensor. description:string; // A list of names of the dimensions in this tensor. The length of // dimension_names need to match the number of dimensions in this tensor. // // : // The name of each dimension in the generated model interface. See "Case 2" // in the comments of Content.range. dimension_names:[string]; // The content that represents this tensor. // // : // Determines how to process this tensor. See each item in ContentProperties // for the default process units that will be applied to the tensor. content:Content; // The process units that are used to process the tensor out-of-graph. // // : // Contains the parameters of the default processing pipeline for each content // type, such as the normalization parameters in all content types. See the // items under ContentProperties for the details of the default processing // pipeline. process_units:[ProcessUnit]; // The statistics of the tensor values. stats:Stats; // A list of associated files of this tensor. // // : // Contains processing parameters of this tensor, such as normalization. associated_files:[AssociatedFile]; } table CustomMetadata { name:string; data:[ubyte] (force_align: 16); } table SubGraphMetadata { // Name of the subgraph. // // Note that, since TFLite only support one subgraph at this moment, the // Codegen tool will use the name in ModelMetadata in the generated model // interface. name:string; // A description explains details about what the subgraph does. description:string; // Metadata of all input tensors used in this subgraph. It matches exactly // the input tensors specified by `SubGraph.inputs` in the TFLite // schema.fbs file[2]. The number of `TensorMetadata` in the array should // equal to the number of indices in `SubGraph.inputs`. // // [2]: tensorflow/lite/schema/schema.fbs // : // Determines how to process the inputs. input_tensor_metadata:[TensorMetadata]; // Metadata of all output tensors used in this subgraph. It matches exactly // the output tensors specified by `SubGraph.outputs` in the TFLite // schema.fbs file[2]. The number of `TensorMetadata` in the array should // equal to the number of indices in `SubGraph.outputs`. // // : // Determines how to process the outputs. output_tensor_metadata:[TensorMetadata]; // A list of associated files of this subgraph. associated_files:[AssociatedFile]; // Input process units of the subgraph. Some models may have complex pre and // post processing logics where the process units do not work on one tensor at // a time, but in a similar way of a TFLite graph. For example, in the // MobileBert model (https://www.tensorflow.org/lite/models/bert_qa/overview), // the inputs are: ids / mask / segment ids; // the outputs are: end logits / start logits. // The preprocessing converts the query string and the context string to the // model inputs, and the post-processing converts the model outputs to the // answer string. // Added in: 1.1.0 input_process_units:[ProcessUnit]; // Output process units of the subgraph. // Added in: 1.1.0 output_process_units:[ProcessUnit]; // Metadata of all input tensor groups used in this subgraph. // // : // Bundles the corresponding elements of the underlying input tensors together // into a class, and converts those individual tensors into a list of the // class objects. // Added in: 1.2.0 input_tensor_groups:[TensorGroup]; // Metadata of all output tensor groups used in this subgraph. // // : // Bundles the corresponding elements of the underlying output tensors // together into a class, and converts those individual tensors into a list of // the class objects. // Added in: 1.2.0 output_tensor_groups:[TensorGroup]; // A list of custom metadata. // Added in: 1.5.0. custom_metadata:[CustomMetadata]; } table ModelMetadata { // Name of the model. // // : // The name of the model in the generated model interface. name:string; // Model description in schema. description:string; // Version of the model that specified by model creators. version:string; // Noted that, the minimum required TFLite runtime version that the model is // compatible with, has already been added as a metadata entry in tflite // schema. We'll decide later if we want to move it here, and keep it with // other metadata entries. // Metadata of all the subgraphs of the model. The 0th is assumed to be the // main subgraph. // // : // Determines how to process the inputs and outputs. subgraph_metadata:[SubGraphMetadata]; // The person who creates this model. author:string; // Licenses that may apply to this model. license:string; // A list of associated files of this model. associated_files:[AssociatedFile]; // The minimum metadata parser version that can fully understand the fields in // the metadata flatbuffer. The version is effectively the largest version // number among the versions of all the fields populated and the smallest // compatible version indicated by the file identifier. // // This field is automatically populated by the MetadataPopulator when // the metadata is populated into a TFLite model. min_parser_version:string; } root_type ModelMetadata;