blob: f38c0f836870405dcebd49e6217b5979e9c35ef1 [file] [log] [blame]
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vision
import (
"image"
"golang.org/x/text/language"
pb "google.golang.org/genproto/googleapis/cloud/vision/v1"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
)
// Annotations contains all the annotations performed by the API on a single image.
// A nil field indicates either that the corresponding feature was not requested,
// or that annotation failed for that feature.
type Annotations struct {
// Faces holds the results of face detection.
Faces []*FaceAnnotation
// Landmarks holds the results of landmark detection.
Landmarks []*EntityAnnotation
// Logos holds the results of logo detection.
Logos []*EntityAnnotation
// Labels holds the results of label detection.
Labels []*EntityAnnotation
// Texts holds the results of text detection.
Texts []*EntityAnnotation
// FullText holds the results of full text (OCR) detection.
FullText *TextAnnotation
// SafeSearch holds the results of safe-search detection.
SafeSearch *SafeSearchAnnotation
// ImageProps contains properties of the annotated image.
ImageProps *ImageProps
// Web contains web annotations for the image.
Web *WebDetection
// CropHints contains crop hints for the image.
CropHints []*CropHint
// If non-nil, then one or more of the attempted annotations failed.
// Non-nil annotations are guaranteed to be correct, even if Error is
// non-nil.
Error error
}
func annotationsFromProto(res *pb.AnnotateImageResponse) *Annotations {
as := &Annotations{}
for _, a := range res.FaceAnnotations {
as.Faces = append(as.Faces, faceAnnotationFromProto(a))
}
for _, a := range res.LandmarkAnnotations {
as.Landmarks = append(as.Landmarks, entityAnnotationFromProto(a))
}
for _, a := range res.LogoAnnotations {
as.Logos = append(as.Logos, entityAnnotationFromProto(a))
}
for _, a := range res.LabelAnnotations {
as.Labels = append(as.Labels, entityAnnotationFromProto(a))
}
for _, a := range res.TextAnnotations {
as.Texts = append(as.Texts, entityAnnotationFromProto(a))
}
as.FullText = textAnnotationFromProto(res.FullTextAnnotation)
as.SafeSearch = safeSearchAnnotationFromProto(res.SafeSearchAnnotation)
as.ImageProps = imagePropertiesFromProto(res.ImagePropertiesAnnotation)
as.Web = webDetectionFromProto(res.WebDetection)
as.CropHints = cropHintsFromProto(res.CropHintsAnnotation)
if res.Error != nil {
// res.Error is a google.rpc.Status. Convert to a Go error. Use a gRPC
// error because it preserves the code as a separate field.
// TODO(jba): preserve the details field.
as.Error = grpc.Errorf(codes.Code(res.Error.Code), "%s", res.Error.Message)
}
return as
}
// A FaceAnnotation describes the results of face detection on an image.
type FaceAnnotation struct {
// BoundingPoly is the bounding polygon around the face. The coordinates of
// the bounding box are in the original image's scale, as returned in
// ImageParams. The bounding box is computed to "frame" the face in
// accordance with human expectations. It is based on the landmarker
// results. Note that one or more x and/or y coordinates may not be
// generated in the BoundingPoly (the polygon will be unbounded) if only a
// partial face appears in the image to be annotated.
BoundingPoly []image.Point
// FDBoundingPoly is tighter than BoundingPoly, and
// encloses only the skin part of the face. Typically, it is used to
// eliminate the face from any image analysis that detects the "amount of
// skin" visible in an image. It is not based on the landmarker results, only
// on the initial face detection, hence the fd (face detection) prefix.
FDBoundingPoly []image.Point
// Landmarks are detected face landmarks.
Face FaceLandmarks
// RollAngle indicates the amount of clockwise/anti-clockwise rotation of
// the face relative to the image vertical, about the axis perpendicular to
// the face. Range [-180,180].
RollAngle float32
// PanAngle is the yaw angle: the leftward/rightward angle that the face is
// pointing, relative to the vertical plane perpendicular to the image. Range
// [-180,180].
PanAngle float32
// TiltAngle is the pitch angle: the upwards/downwards angle that the face is
// pointing relative to the image's horizontal plane. Range [-180,180].
TiltAngle float32
// DetectionConfidence is the detection confidence. The range is [0, 1].
DetectionConfidence float32
// LandmarkingConfidence is the face landmarking confidence. The range is [0, 1].
LandmarkingConfidence float32
// Likelihoods expresses the likelihood of various aspects of the face.
Likelihoods *FaceLikelihoods
}
func faceAnnotationFromProto(pfa *pb.FaceAnnotation) *FaceAnnotation {
fa := &FaceAnnotation{
BoundingPoly: boundingPolyFromProto(pfa.BoundingPoly),
FDBoundingPoly: boundingPolyFromProto(pfa.FdBoundingPoly),
RollAngle: pfa.RollAngle,
PanAngle: pfa.PanAngle,
TiltAngle: pfa.TiltAngle,
DetectionConfidence: pfa.DetectionConfidence,
LandmarkingConfidence: pfa.LandmarkingConfidence,
Likelihoods: &FaceLikelihoods{
Joy: Likelihood(pfa.JoyLikelihood),
Sorrow: Likelihood(pfa.SorrowLikelihood),
Anger: Likelihood(pfa.AngerLikelihood),
Surprise: Likelihood(pfa.SurpriseLikelihood),
UnderExposed: Likelihood(pfa.UnderExposedLikelihood),
Blurred: Likelihood(pfa.BlurredLikelihood),
Headwear: Likelihood(pfa.HeadwearLikelihood),
},
}
populateFaceLandmarks(pfa.Landmarks, &fa.Face)
return fa
}
// An EntityAnnotation describes the results of a landmark, label, logo or text
// detection on an image.
type EntityAnnotation struct {
// ID is an opaque entity ID. Some IDs might be available in Knowledge Graph(KG).
// For more details on KG please see:
// https://developers.google.com/knowledge-graph/
ID string
// Locale is the language code for the locale in which the entity textual
// description (next field) is expressed.
Locale string
// Description is the entity textual description, expressed in the language of Locale.
Description string
// Score is the overall score of the result. Range [0, 1].
Score float32
// Confidence is the accuracy of the entity detection in an image.
// For example, for an image containing the Eiffel Tower, this field represents
// the confidence that there is a tower in the query image. Range [0, 1].
Confidence float32
// Topicality is the relevancy of the ICA (Image Content Annotation) label to the
// image. For example, the relevancy of 'tower' to an image containing
// 'Eiffel Tower' is likely higher than an image containing a distant towering
// building, though the confidence that there is a tower may be the same.
// Range [0, 1].
Topicality float32
// BoundingPoly is the image region to which this entity belongs. Not filled currently
// for label detection. For text detection, BoundingPolys
// are produced for the entire text detected in an image region, followed by
// BoundingPolys for each word within the detected text.
BoundingPoly []image.Point
// Locations contains the location information for the detected entity.
// Multiple LatLng structs can be present since one location may indicate the
// location of the scene in the query image, and another the location of the
// place where the query image was taken. Location information is usually
// present for landmarks.
Locations []LatLng
// Properties are additional optional Property fields.
// For example a different kind of score or string that qualifies the entity.
Properties []Property
}
func entityAnnotationFromProto(e *pb.EntityAnnotation) *EntityAnnotation {
var locs []LatLng
for _, li := range e.Locations {
locs = append(locs, latLngFromProto(li.LatLng))
}
var props []Property
for _, p := range e.Properties {
props = append(props, propertyFromProto(p))
}
return &EntityAnnotation{
ID: e.Mid,
Locale: e.Locale,
Description: e.Description,
Score: e.Score,
Confidence: e.Confidence,
Topicality: e.Topicality,
BoundingPoly: boundingPolyFromProto(e.BoundingPoly),
Locations: locs,
Properties: props,
}
}
// TextAnnotation contains a structured representation of OCR extracted text.
// The hierarchy of an OCR extracted text structure looks like:
// TextAnnotation -> Page -> Block -> Paragraph -> Word -> Symbol
// Each structural component, starting from Page, may further have its own
// properties. Properties describe detected languages, breaks etc.
type TextAnnotation struct {
// List of pages detected by OCR.
Pages []*Page
// UTF-8 text detected on the pages.
Text string
}
func textAnnotationFromProto(pta *pb.TextAnnotation) *TextAnnotation {
if pta == nil {
return nil
}
var pages []*Page
for _, p := range pta.Pages {
pages = append(pages, pageFromProto(p))
}
return &TextAnnotation{
Pages: pages,
Text: pta.Text,
}
}
// A Page is a page of text detected from OCR.
type Page struct {
// Additional information detected on the page.
Properties *TextProperties
// Page width in pixels.
Width int32
// Page height in pixels.
Height int32
// List of blocks of text, images etc on this page.
Blocks []*Block
}
func pageFromProto(p *pb.Page) *Page {
if p == nil {
return nil
}
var blocks []*Block
for _, b := range p.Blocks {
blocks = append(blocks, blockFromProto(b))
}
return &Page{
Properties: textPropertiesFromProto(p.Property),
Width: p.Width,
Height: p.Height,
Blocks: blocks,
}
}
// A Block is a logical element on the page.
type Block struct {
// Additional information detected for the block.
Properties *TextProperties
// The bounding box for the block.
// The vertices are in the order of top-left, top-right, bottom-right,
// bottom-left. When a rotation of the bounding box is detected the rotation
// is represented as around the top-left corner as defined when the text is
// read in the 'natural' orientation.
// For example:
// * when the text is horizontal it might look like:
// 0----1
// | |
// 3----2
// * when it's rotated 180 degrees around the top-left corner it becomes:
// 2----3
// | |
// 1----0
// and the vertice order will still be (0, 1, 2, 3).
BoundingBox []image.Point
// List of paragraphs in this block (if this blocks is of type text).
Paragraphs []*Paragraph
// Detected block type (text, image etc) for this block.
BlockType BlockType
}
// A BlockType represents the kind of Block (text, image, etc.)
type BlockType int
const (
// Unknown block type.
UnknownBlock BlockType = BlockType(pb.Block_UNKNOWN)
// Regular text block.
TextBlock BlockType = BlockType(pb.Block_TEXT)
// Table block.
TableBlock BlockType = BlockType(pb.Block_TABLE)
// Image block.
PictureBlock BlockType = BlockType(pb.Block_PICTURE)
// Horizontal/vertical line box.
RulerBlock BlockType = BlockType(pb.Block_RULER)
// Barcode block.
BarcodeBlock BlockType = BlockType(pb.Block_BARCODE)
)
func blockFromProto(p *pb.Block) *Block {
if p == nil {
return nil
}
var paras []*Paragraph
for _, pa := range p.Paragraphs {
paras = append(paras, paragraphFromProto(pa))
}
return &Block{
Properties: textPropertiesFromProto(p.Property),
BoundingBox: boundingPolyFromProto(p.BoundingBox),
Paragraphs: paras,
BlockType: BlockType(p.BlockType),
}
}
// A Paragraph is a structural unit of text representing a number of words in
// certain order.
type Paragraph struct {
// Additional information detected for the paragraph.
Properties *TextProperties
// The bounding box for the paragraph.
// The vertices are in the order of top-left, top-right, bottom-right,
// bottom-left. When a rotation of the bounding box is detected the rotation
// is represented as around the top-left corner as defined when the text is
// read in the 'natural' orientation.
// For example:
// * when the text is horizontal it might look like:
// 0----1
// | |
// 3----2
// * when it's rotated 180 degrees around the top-left corner it becomes:
// 2----3
// | |
// 1----0
// and the vertice order will still be (0, 1, 2, 3).
BoundingBox []image.Point
// List of words in this paragraph.
Words []*Word
}
func paragraphFromProto(p *pb.Paragraph) *Paragraph {
if p == nil {
return nil
}
var words []*Word
for _, w := range p.Words {
words = append(words, wordFromProto(w))
}
return &Paragraph{
Properties: textPropertiesFromProto(p.Property),
BoundingBox: boundingPolyFromProto(p.BoundingBox),
Words: words,
}
}
// A Word is a word in a text document.
type Word struct {
// Additional information detected for the word.
Properties *TextProperties
// The bounding box for the word.
// The vertices are in the order of top-left, top-right, bottom-right,
// bottom-left. When a rotation of the bounding box is detected the rotation
// is represented as around the top-left corner as defined when the text is
// read in the 'natural' orientation.
// For example:
// * when the text is horizontal it might look like:
// 0----1
// | |
// 3----2
// * when it's rotated 180 degrees around the top-left corner it becomes:
// 2----3
// | |
// 1----0
// and the vertice order will still be (0, 1, 2, 3).
BoundingBox []image.Point
// List of symbols in the word.
// The order of the symbols follows the natural reading order.
Symbols []*Symbol
}
func wordFromProto(p *pb.Word) *Word {
if p == nil {
return nil
}
var syms []*Symbol
for _, s := range p.Symbols {
syms = append(syms, symbolFromProto(s))
}
return &Word{
Properties: textPropertiesFromProto(p.Property),
BoundingBox: boundingPolyFromProto(p.BoundingBox),
Symbols: syms,
}
}
// A Symbol is a symbol in a text document.
type Symbol struct {
// Additional information detected for the symbol.
Properties *TextProperties
// The bounding box for the symbol.
// The vertices are in the order of top-left, top-right, bottom-right,
// bottom-left. When a rotation of the bounding box is detected the rotation
// is represented as around the top-left corner as defined when the text is
// read in the 'natural' orientation.
// For example:
// * when the text is horizontal it might look like:
// 0----1
// | |
// 3----2
// * when it's rotated 180 degrees around the top-left corner it becomes:
// 2----3
// | |
// 1----0
// and the vertice order will still be (0, 1, 2, 3).
BoundingBox []image.Point
// The actual UTF-8 representation of the symbol.
Text string
}
func symbolFromProto(p *pb.Symbol) *Symbol {
if p == nil {
return nil
}
return &Symbol{
Properties: textPropertiesFromProto(p.Property),
BoundingBox: boundingPolyFromProto(p.BoundingBox),
Text: p.Text,
}
}
// TextProperties contains additional information about an OCR structural component.
type TextProperties struct {
// A list of detected languages together with confidence.
DetectedLanguages []*DetectedLanguage
// Detected start or end of a text segment.
DetectedBreak *DetectedBreak
}
// Detected language for a structural component.
type DetectedLanguage struct {
// The BCP-47 language code, such as "en-US" or "sr-Latn".
Code language.Tag
// The confidence of the detected language, in the range [0, 1].
Confidence float32
}
// DetectedBreak is the detected start or end of a structural component.
type DetectedBreak struct {
// The type of break.
Type DetectedBreakType
// True if break prepends the element.
IsPrefix bool
}
type DetectedBreakType int
const (
// Unknown break label type.
UnknownBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_UNKNOWN)
// Regular space.
SpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_SPACE)
// Sure space (very wide).
SureSpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_SURE_SPACE)
// Line-wrapping break.
EOLSureSpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_EOL_SURE_SPACE)
// End-line hyphen that is not present in text; does not co-occur with SPACE, LEADER_SPACE, or LINE_BREAK.
HyphenBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_HYPHEN)
// Line break that ends a paragraph.
LineBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_LINE_BREAK)
)
func textPropertiesFromProto(p *pb.TextAnnotation_TextProperty) *TextProperties {
var dls []*DetectedLanguage
for _, dl := range p.DetectedLanguages {
tag, _ := language.Parse(dl.LanguageCode)
// Ignore error. If err != nil the returned tag will not be garbage,
// but a best-effort attempt at a parse. At worst it will be
// language.Und, the documented "undefined" Tag.
dls = append(dls, &DetectedLanguage{Code: tag, Confidence: dl.Confidence})
}
var db *DetectedBreak
if p.DetectedBreak != nil {
db = &DetectedBreak{
Type: DetectedBreakType(p.DetectedBreak.Type),
IsPrefix: p.DetectedBreak.IsPrefix,
}
}
return &TextProperties{
DetectedLanguages: dls,
DetectedBreak: db,
}
}
// SafeSearchAnnotation describes the results of a SafeSearch detection on an image.
type SafeSearchAnnotation struct {
// Adult is the likelihood that the image contains adult content.
Adult Likelihood
// Spoof is the likelihood that an obvious modification was made to the
// image's canonical version to make it appear funny or offensive.
Spoof Likelihood
// Medical is the likelihood that this is a medical image.
Medical Likelihood
// Violence is the likelihood that this image represents violence.
Violence Likelihood
}
func safeSearchAnnotationFromProto(s *pb.SafeSearchAnnotation) *SafeSearchAnnotation {
if s == nil {
return nil
}
return &SafeSearchAnnotation{
Adult: Likelihood(s.Adult),
Spoof: Likelihood(s.Spoof),
Medical: Likelihood(s.Medical),
Violence: Likelihood(s.Violence),
}
}
// ImageProps describes properties of the image itself, like the dominant colors.
type ImageProps struct {
// DominantColors describes the dominant colors of the image.
DominantColors []*ColorInfo
}
func imagePropertiesFromProto(ip *pb.ImageProperties) *ImageProps {
if ip == nil || ip.DominantColors == nil {
return nil
}
var cinfos []*ColorInfo
for _, ci := range ip.DominantColors.Colors {
cinfos = append(cinfos, colorInfoFromProto(ci))
}
return &ImageProps{DominantColors: cinfos}
}
// WebDetection contains relevant information for the image from the Internet.
type WebDetection struct {
// Deduced entities from similar images on the Internet.
WebEntities []*WebEntity
// Fully matching images from the Internet.
// They're definite neardups and most often a copy of the query image with
// merely a size change.
FullMatchingImages []*WebImage
// Partial matching images from the Internet.
// Those images are similar enough to share some key-point features. For
// example an original image will likely have partial matching for its crops.
PartialMatchingImages []*WebImage
// Web pages containing the matching images from the Internet.
PagesWithMatchingImages []*WebPage
}
func webDetectionFromProto(p *pb.WebDetection) *WebDetection {
if p == nil {
return nil
}
var (
wes []*WebEntity
fmis, pmis []*WebImage
wps []*WebPage
)
for _, e := range p.WebEntities {
wes = append(wes, webEntityFromProto(e))
}
for _, m := range p.FullMatchingImages {
fmis = append(fmis, webImageFromProto(m))
}
for _, m := range p.PartialMatchingImages {
pmis = append(fmis, webImageFromProto(m))
}
for _, g := range p.PagesWithMatchingImages {
wps = append(wps, webPageFromProto(g))
}
return &WebDetection{
WebEntities: wes,
FullMatchingImages: fmis,
PartialMatchingImages: pmis,
PagesWithMatchingImages: wps,
}
}
// A WebEntity is an entity deduced from similar images on the Internet.
type WebEntity struct {
// Opaque entity ID.
ID string
// Overall relevancy score for the entity.
// Not normalized and not comparable across different image queries.
Score float32
// Canonical description of the entity, in English.
Description string
}
func webEntityFromProto(p *pb.WebDetection_WebEntity) *WebEntity {
return &WebEntity{
ID: p.EntityId,
Score: p.Score,
Description: p.Description,
}
}
// WebImage contains metadata for online images.
type WebImage struct {
// The result image URL.
URL string
// Overall relevancy score for the image.
// Not normalized and not comparable across different image queries.
Score float32
}
func webImageFromProto(p *pb.WebDetection_WebImage) *WebImage {
return &WebImage{
URL: p.Url,
Score: p.Score,
}
}
// A WebPage contains metadata for web pages.
type WebPage struct {
// The result web page URL.
URL string
// Overall relevancy score for the web page.
// Not normalized and not comparable across different image queries.
Score float32
}
func webPageFromProto(p *pb.WebDetection_WebPage) *WebPage {
return &WebPage{
URL: p.Url,
Score: p.Score,
}
}
// CropHint is a single crop hint that is used to generate a new crop when
// serving an image.
type CropHint struct {
// The bounding polygon for the crop region. The coordinates of the bounding
// box are in the original image's scale, as returned in `ImageParams`.
BoundingPoly []image.Point
// Confidence of this being a salient region. Range [0, 1].
Confidence float32
// Fraction of importance of this salient region with respect to the original
// image.
ImportanceFraction float32
}
func cropHintsFromProto(p *pb.CropHintsAnnotation) []*CropHint {
if p == nil {
return nil
}
var chs []*CropHint
for _, pch := range p.CropHints {
chs = append(chs, cropHintFromProto(pch))
}
return chs
}
func cropHintFromProto(pch *pb.CropHint) *CropHint {
return &CropHint{
BoundingPoly: boundingPolyFromProto(pch.BoundingPoly),
Confidence: pch.Confidence,
ImportanceFraction: pch.ImportanceFraction,
}
}