| // Copyright 2017 Google Inc. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| syntax = "proto3"; |
| |
| package google.cloud.vision.v1p1beta1; |
| |
| import "google/api/annotations.proto"; |
| import "google/cloud/vision/v1p1beta1/geometry.proto"; |
| import "google/cloud/vision/v1p1beta1/text_annotation.proto"; |
| import "google/cloud/vision/v1p1beta1/web_detection.proto"; |
| import "google/rpc/status.proto"; |
| import "google/type/color.proto"; |
| import "google/type/latlng.proto"; |
| |
| option cc_enable_arenas = true; |
| option go_package = "google.golang.org/genproto/googleapis/cloud/vision/v1p1beta1;vision"; |
| option java_multiple_files = true; |
| option java_outer_classname = "ImageAnnotatorProto"; |
| option java_package = "com.google.cloud.vision.v1p1beta1"; |
| |
| // Service that performs Google Cloud Vision API detection tasks over client |
| // images, such as face, landmark, logo, label, and text detection. The |
| // ImageAnnotator service returns detected entities from the images. |
| service ImageAnnotator { |
| // Run image detection and annotation for a batch of images. |
| rpc BatchAnnotateImages(BatchAnnotateImagesRequest) |
| returns (BatchAnnotateImagesResponse) { |
| option (google.api.http) = { |
| post: "/v1p1beta1/images:annotate" |
| body: "*" |
| }; |
| } |
| } |
| |
| // Users describe the type of Google Cloud Vision API tasks to perform over |
| // images by using *Feature*s. Each Feature indicates a type of image |
| // detection task to perform. Features encode the Cloud Vision API |
| // vertical to operate on and the number of top-scoring results to return. |
| message Feature { |
| // Type of image feature. |
| enum Type { |
| // Unspecified feature type. |
| TYPE_UNSPECIFIED = 0; |
| |
| // Run face detection. |
| FACE_DETECTION = 1; |
| |
| // Run landmark detection. |
| LANDMARK_DETECTION = 2; |
| |
| // Run logo detection. |
| LOGO_DETECTION = 3; |
| |
| // Run label detection. |
| LABEL_DETECTION = 4; |
| |
| // Run OCR. |
| TEXT_DETECTION = 5; |
| |
| // Run dense text document OCR. Takes precedence when both |
| // DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present. |
| DOCUMENT_TEXT_DETECTION = 11; |
| |
| // Run computer vision models to compute image safe-search properties. |
| SAFE_SEARCH_DETECTION = 6; |
| |
| // Compute a set of image properties, such as the image's dominant colors. |
| IMAGE_PROPERTIES = 7; |
| |
| // Run crop hints. |
| CROP_HINTS = 9; |
| |
| // Run web detection. |
| WEB_DETECTION = 10; |
| } |
| |
| // The feature type. |
| Type type = 1; |
| |
| // Maximum number of results of this type. |
| int32 max_results = 2; |
| |
| // Model to use for the feature. |
| // Supported values: "builtin/stable" (the default if unset) and |
| // "builtin/latest". |
| string model = 3; |
| } |
| |
| // External image source (Google Cloud Storage image location). |
| message ImageSource { |
| // NOTE: For new code `image_uri` below is preferred. |
| // Google Cloud Storage image URI, which must be in the following form: |
| // `gs://bucket_name/object_name` (for details, see |
| // [Google Cloud Storage Request |
| // URIs](https://cloud.google.com/storage/docs/reference-uris)). |
| // NOTE: Cloud Storage object versioning is not supported. |
| string gcs_image_uri = 1; |
| |
| // Image URI which supports: |
| // 1) Google Cloud Storage image URI, which must be in the following form: |
| // `gs://bucket_name/object_name` (for details, see |
| // [Google Cloud Storage Request |
| // URIs](https://cloud.google.com/storage/docs/reference-uris)). |
| // NOTE: Cloud Storage object versioning is not supported. |
| // 2) Publicly accessible image HTTP/HTTPS URL. |
| // This is preferred over the legacy `gcs_image_uri` above. When both |
| // `gcs_image_uri` and `image_uri` are specified, `image_uri` takes |
| // precedence. |
| string image_uri = 2; |
| } |
| |
| // Client image to perform Google Cloud Vision API tasks over. |
| message Image { |
| // Image content, represented as a stream of bytes. |
// Note: as with all `bytes` fields, protocol buffers use a pure binary
// representation, whereas JSON representations use base64.
| bytes content = 1; |
| |
| // Google Cloud Storage image location. If both `content` and `source` |
| // are provided for an image, `content` takes precedence and is |
| // used to perform the image annotation request. |
| ImageSource source = 2; |
| } |
| |
| // A face annotation object contains the results of face detection. |
| message FaceAnnotation { |
| // A face-specific landmark (for example, a face feature). |
| message Landmark { |
| // Face landmark (feature) type. |
| // Left and right are defined from the vantage of the viewer of the image |
| // without considering mirror projections typical of photos. So, `LEFT_EYE`, |
| // typically, is the person's right eye. |
| enum Type { |
| // Unknown face landmark detected. Should not be filled. |
| UNKNOWN_LANDMARK = 0; |
| |
| // Left eye. |
| LEFT_EYE = 1; |
| |
| // Right eye. |
| RIGHT_EYE = 2; |
| |
| // Left of left eyebrow. |
| LEFT_OF_LEFT_EYEBROW = 3; |
| |
| // Right of left eyebrow. |
| RIGHT_OF_LEFT_EYEBROW = 4; |
| |
| // Left of right eyebrow. |
| LEFT_OF_RIGHT_EYEBROW = 5; |
| |
| // Right of right eyebrow. |
| RIGHT_OF_RIGHT_EYEBROW = 6; |
| |
| // Midpoint between eyes. |
| MIDPOINT_BETWEEN_EYES = 7; |
| |
| // Nose tip. |
| NOSE_TIP = 8; |
| |
| // Upper lip. |
| UPPER_LIP = 9; |
| |
| // Lower lip. |
| LOWER_LIP = 10; |
| |
| // Mouth left. |
| MOUTH_LEFT = 11; |
| |
| // Mouth right. |
| MOUTH_RIGHT = 12; |
| |
| // Mouth center. |
| MOUTH_CENTER = 13; |
| |
| // Nose, bottom right. |
| NOSE_BOTTOM_RIGHT = 14; |
| |
| // Nose, bottom left. |
| NOSE_BOTTOM_LEFT = 15; |
| |
| // Nose, bottom center. |
| NOSE_BOTTOM_CENTER = 16; |
| |
| // Left eye, top boundary. |
| LEFT_EYE_TOP_BOUNDARY = 17; |
| |
| // Left eye, right corner. |
| LEFT_EYE_RIGHT_CORNER = 18; |
| |
| // Left eye, bottom boundary. |
| LEFT_EYE_BOTTOM_BOUNDARY = 19; |
| |
| // Left eye, left corner. |
| LEFT_EYE_LEFT_CORNER = 20; |
| |
| // Right eye, top boundary. |
| RIGHT_EYE_TOP_BOUNDARY = 21; |
| |
| // Right eye, right corner. |
| RIGHT_EYE_RIGHT_CORNER = 22; |
| |
| // Right eye, bottom boundary. |
| RIGHT_EYE_BOTTOM_BOUNDARY = 23; |
| |
| // Right eye, left corner. |
| RIGHT_EYE_LEFT_CORNER = 24; |
| |
| // Left eyebrow, upper midpoint. |
| LEFT_EYEBROW_UPPER_MIDPOINT = 25; |
| |
| // Right eyebrow, upper midpoint. |
| RIGHT_EYEBROW_UPPER_MIDPOINT = 26; |
| |
| // Left ear tragion. |
| LEFT_EAR_TRAGION = 27; |
| |
| // Right ear tragion. |
| RIGHT_EAR_TRAGION = 28; |
| |
| // Left eye pupil. |
| LEFT_EYE_PUPIL = 29; |
| |
| // Right eye pupil. |
| RIGHT_EYE_PUPIL = 30; |
| |
| // Forehead glabella. |
| FOREHEAD_GLABELLA = 31; |
| |
| // Chin gnathion. |
| CHIN_GNATHION = 32; |
| |
| // Chin left gonion. |
| CHIN_LEFT_GONION = 33; |
| |
| // Chin right gonion. |
| CHIN_RIGHT_GONION = 34; |
| } |
| |
| // Face landmark type. |
| Type type = 3; |
| |
| // Face landmark position. |
| Position position = 4; |
| } |
| |
// The bounding polygon around the face. The coordinates of the bounding box
// are in the original image's scale.
| // The bounding box is computed to "frame" the face in accordance with human |
| // expectations. It is based on the landmarker results. |
| // Note that one or more x and/or y coordinates may not be generated in the |
| // `BoundingPoly` (the polygon will be unbounded) if only a partial face |
| // appears in the image to be annotated. |
| BoundingPoly bounding_poly = 1; |
| |
// The `fd_bounding_poly` bounding polygon is tighter than the
// `bounding_poly`, and encloses only the skin part of the face. Typically, it
| // is used to eliminate the face from any image analysis that detects the |
| // "amount of skin" visible in an image. It is not based on the |
| // landmarker results, only on the initial face detection, hence |
| // the <code>fd</code> (face detection) prefix. |
| BoundingPoly fd_bounding_poly = 2; |
| |
| // Detected face landmarks. |
| repeated Landmark landmarks = 3; |
| |
| // Roll angle, which indicates the amount of clockwise/anti-clockwise rotation |
| // of the face relative to the image vertical about the axis perpendicular to |
| // the face. Range [-180,180]. |
| float roll_angle = 4; |
| |
| // Yaw angle, which indicates the leftward/rightward angle that the face is |
| // pointing relative to the vertical plane perpendicular to the image. Range |
| // [-180,180]. |
| float pan_angle = 5; |
| |
| // Pitch angle, which indicates the upwards/downwards angle that the face is |
| // pointing relative to the image's horizontal plane. Range [-180,180]. |
| float tilt_angle = 6; |
| |
| // Detection confidence. Range [0, 1]. |
| float detection_confidence = 7; |
| |
| // Face landmarking confidence. Range [0, 1]. |
| float landmarking_confidence = 8; |
| |
| // Joy likelihood. |
| Likelihood joy_likelihood = 9; |
| |
| // Sorrow likelihood. |
| Likelihood sorrow_likelihood = 10; |
| |
| // Anger likelihood. |
| Likelihood anger_likelihood = 11; |
| |
| // Surprise likelihood. |
| Likelihood surprise_likelihood = 12; |
| |
| // Under-exposed likelihood. |
| Likelihood under_exposed_likelihood = 13; |
| |
| // Blurred likelihood. |
| Likelihood blurred_likelihood = 14; |
| |
| // Headwear likelihood. |
| Likelihood headwear_likelihood = 15; |
| } |
| |
| // Detected entity location information. |
| message LocationInfo { |
// Lat/long location coordinates.
| google.type.LatLng lat_lng = 1; |
| } |
| |
| // A `Property` consists of a user-supplied name/value pair. |
| message Property { |
| // Name of the property. |
| string name = 1; |
| |
| // Value of the property. |
| string value = 2; |
| |
| // Value of numeric properties. |
| uint64 uint64_value = 3; |
| } |
| |
| // Set of detected entity features. |
| message EntityAnnotation { |
| // Opaque entity ID. Some IDs may be available in |
| // [Google Knowledge Graph Search API](https://developers.google.com/knowledge-graph/). |
| string mid = 1; |
| |
| // The language code for the locale in which the entity textual |
| // `description` is expressed. |
| string locale = 2; |
| |
| // Entity textual description, expressed in its `locale` language. |
| string description = 3; |
| |
| // Overall score of the result. Range [0, 1]. |
| float score = 4; |
| |
| // The accuracy of the entity detection in an image. |
| // For example, for an image in which the "Eiffel Tower" entity is detected, |
| // this field represents the confidence that there is a tower in the query |
| // image. Range [0, 1]. |
| float confidence = 5; |
| |
| // The relevancy of the ICA (Image Content Annotation) label to the |
| // image. For example, the relevancy of "tower" is likely higher to an image |
| // containing the detected "Eiffel Tower" than to an image containing a |
| // detected distant towering building, even though the confidence that |
| // there is a tower in each image may be the same. Range [0, 1]. |
| float topicality = 6; |
| |
| // Image region to which this entity belongs. Not produced |
| // for `LABEL_DETECTION` features. |
| BoundingPoly bounding_poly = 7; |
| |
| // The location information for the detected entity. Multiple |
| // `LocationInfo` elements can be present because one location may |
| // indicate the location of the scene in the image, and another location |
| // may indicate the location of the place where the image was taken. |
| // Location information is usually present for landmarks. |
| repeated LocationInfo locations = 8; |
| |
// Some entities may have optional user-supplied `Property` (name/value)
// fields, such as a score or string that qualifies the entity.
| repeated Property properties = 9; |
| } |
| |
| // Set of features pertaining to the image, computed by computer vision |
| // methods over safe-search verticals (for example, adult, spoof, medical, |
| // violence). |
| message SafeSearchAnnotation { |
| // Represents the adult content likelihood for the image. Adult content may |
| // contain elements such as nudity, pornographic images or cartoons, or |
| // sexual activities. |
| Likelihood adult = 1; |
| |
// Spoof likelihood. The likelihood that a modification
// was made to the image's canonical version to make it appear
// funny or offensive.
| Likelihood spoof = 2; |
| |
| // Likelihood that this is a medical image. |
| Likelihood medical = 3; |
| |
| // Likelihood that this image contains violent content. |
| Likelihood violence = 4; |
| |
| // Likelihood that the request image contains racy content. Racy content may |
| // include (but is not limited to) skimpy or sheer clothing, strategically |
| // covered nudity, lewd or provocative poses, or close-ups of sensitive |
| // body areas. |
| Likelihood racy = 9; |
| } |
| |
| // Rectangle determined by min and max `LatLng` pairs. |
| message LatLongRect { |
| // Min lat/long pair. |
| google.type.LatLng min_lat_lng = 1; |
| |
| // Max lat/long pair. |
| google.type.LatLng max_lat_lng = 2; |
| } |
| |
| // Color information consists of RGB channels, score, and the fraction of |
| // the image that the color occupies in the image. |
| message ColorInfo { |
| // RGB components of the color. |
| google.type.Color color = 1; |
| |
| // Image-specific score for this color. Value in range [0, 1]. |
| float score = 2; |
| |
| // The fraction of pixels the color occupies in the image. |
| // Value in range [0, 1]. |
| float pixel_fraction = 3; |
| } |
| |
| // Set of dominant colors and their corresponding scores. |
| message DominantColorsAnnotation { |
| // RGB color values with their score and pixel fraction. |
| repeated ColorInfo colors = 1; |
| } |
| |
| // Stores image properties, such as dominant colors. |
| message ImageProperties { |
// If present, dominant color detection completed successfully.
| DominantColorsAnnotation dominant_colors = 1; |
| } |
| |
| // Single crop hint that is used to generate a new crop when serving an image. |
| message CropHint { |
// The bounding polygon for the crop region. The coordinates of the bounding
// box are in the original image's scale.
| BoundingPoly bounding_poly = 1; |
| |
| // Confidence of this being a salient region. Range [0, 1]. |
| float confidence = 2; |
| |
| // Fraction of importance of this salient region with respect to the original |
| // image. |
| float importance_fraction = 3; |
| } |
| |
| // Set of crop hints that are used to generate new crops when serving images. |
| message CropHintsAnnotation { |
| // Crop hint results. |
| repeated CropHint crop_hints = 1; |
| } |
| |
| // Parameters for crop hints annotation request. |
| message CropHintsParams { |
| // Aspect ratios in floats, representing the ratio of the width to the height |
| // of the image. For example, if the desired aspect ratio is 4/3, the |
| // corresponding float value should be 1.33333. If not specified, the |
| // best possible crop is returned. The number of provided aspect ratios is |
| // limited to a maximum of 16; any aspect ratios provided after the 16th are |
| // ignored. |
| repeated float aspect_ratios = 1; |
| } |
| |
| // Parameters for web detection request. |
| message WebDetectionParams { |
| // Whether to include results derived from the geo information in the image. |
| bool include_geo_results = 2; |
| } |
| |
| // Image context and/or feature-specific parameters. |
| message ImageContext { |
// Lat/long rectangle that specifies the location of the image.
| LatLongRect lat_long_rect = 1; |
| |
| // List of languages to use for TEXT_DETECTION. In most cases, an empty value |
| // yields the best results since it enables automatic language detection. For |
| // languages based on the Latin alphabet, setting `language_hints` is not |
| // needed. In rare cases, when the language of the text in the image is known, |
| // setting a hint will help get better results (although it will be a |
| // significant hindrance if the hint is wrong). Text detection returns an |
| // error if one or more of the specified languages is not one of the |
| // [supported languages](/vision/docs/languages). |
| repeated string language_hints = 2; |
| |
| // Parameters for crop hints annotation request. |
| CropHintsParams crop_hints_params = 4; |
| |
| // Parameters for web detection. |
| WebDetectionParams web_detection_params = 6; |
| } |
| |
| // Request for performing Google Cloud Vision API tasks over a user-provided |
| // image, with user-requested features. |
| message AnnotateImageRequest { |
| // The image to be processed. |
| Image image = 1; |
| |
| // Requested features. |
| repeated Feature features = 2; |
| |
| // Additional context that may accompany the image. |
| ImageContext image_context = 3; |
| } |
| |
| // Response to an image annotation request. |
| message AnnotateImageResponse { |
| // If present, face detection has completed successfully. |
| repeated FaceAnnotation face_annotations = 1; |
| |
| // If present, landmark detection has completed successfully. |
| repeated EntityAnnotation landmark_annotations = 2; |
| |
| // If present, logo detection has completed successfully. |
| repeated EntityAnnotation logo_annotations = 3; |
| |
| // If present, label detection has completed successfully. |
| repeated EntityAnnotation label_annotations = 4; |
| |
| // If present, text (OCR) detection has completed successfully. |
| repeated EntityAnnotation text_annotations = 5; |
| |
| // If present, text (OCR) detection or document (OCR) text detection has |
| // completed successfully. |
| // This annotation provides the structural hierarchy for the OCR detected |
| // text. |
| TextAnnotation full_text_annotation = 12; |
| |
| // If present, safe-search annotation has completed successfully. |
| SafeSearchAnnotation safe_search_annotation = 6; |
| |
| // If present, image properties were extracted successfully. |
| ImageProperties image_properties_annotation = 8; |
| |
| // If present, crop hints have completed successfully. |
| CropHintsAnnotation crop_hints_annotation = 11; |
| |
| // If present, web detection has completed successfully. |
| WebDetection web_detection = 13; |
| |
| // If set, represents the error message for the operation. |
| // Note that filled-in image annotations are guaranteed to be |
| // correct, even when `error` is set. |
| google.rpc.Status error = 9; |
| } |
| |
| // Multiple image annotation requests are batched into a single service call. |
| message BatchAnnotateImagesRequest { |
| // Individual image annotation requests for this batch. |
| repeated AnnotateImageRequest requests = 1; |
| } |
| |
| // Response to a batch image annotation request. |
| message BatchAnnotateImagesResponse { |
| // Individual responses to image annotation requests within the batch. |
| repeated AnnotateImageResponse responses = 1; |
| } |
| |
| // A bucketized representation of likelihood, which is intended to give clients |
| // highly stable results across model upgrades. |
| enum Likelihood { |
| // Unknown likelihood. |
| UNKNOWN = 0; |
| |
| // It is very unlikely that the image belongs to the specified vertical. |
| VERY_UNLIKELY = 1; |
| |
| // It is unlikely that the image belongs to the specified vertical. |
| UNLIKELY = 2; |
| |
| // It is possible that the image belongs to the specified vertical. |
| POSSIBLE = 3; |
| |
| // It is likely that the image belongs to the specified vertical. |
| LIKELY = 4; |
| |
| // It is very likely that the image belongs to the specified vertical. |
| VERY_LIKELY = 5; |
| } |