vision/annotations.go - third_party/github.com/googleapis/google-cloud-go - Git at Google

 // Copyright 2016 Google Inc. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //      http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package vision

 import (
 	"image"

 	"golang.org/x/text/language"
 	pb "google.golang.org/genproto/googleapis/cloud/vision/v1"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
 )

 // Annotations contains all the annotations performed by the API on a single image.
 // A nil field indicates either that the corresponding feature was not requested,
 // or that annotation failed for that feature.
 type Annotations struct {
 	// Faces holds the results of face detection.
 	Faces []*FaceAnnotation
 	// Landmarks holds the results of landmark detection.
 	Landmarks []*EntityAnnotation
 	// Logos holds the results of logo detection.
 	Logos []*EntityAnnotation
 	// Labels holds the results of label detection.
 	Labels []*EntityAnnotation
 	// Texts holds the results of text detection.
 	Texts []*EntityAnnotation
 	// FullText holds the results of full text (OCR) detection.
 	FullText *TextAnnotation
 	// SafeSearch holds the results of safe-search detection.
 	SafeSearch *SafeSearchAnnotation
 	// ImageProps contains properties of the annotated image.
 	ImageProps *ImageProps
 	// Web contains web annotations for the image.
 	Web *WebDetection
 	// CropHints contains crop hints for the image.
 	CropHints []*CropHint

 	// Error is non-nil if one or more of the attempted annotations failed.
 	// Non-nil annotations are guaranteed to be correct, even if Error is
 	// non-nil.
 	Error error
 }

 // annotationsFromProto converts a single AnnotateImageResponse into an
 // Annotations value. Each repeated annotation field is converted element by
 // element; a slice is left nil when the response holds no elements for it.
 // res must be non-nil.
 func annotationsFromProto(res *pb.AnnotateImageResponse) *Annotations {
 	var (
 		faces     []*FaceAnnotation
 		landmarks []*EntityAnnotation
 		logos     []*EntityAnnotation
 		labels    []*EntityAnnotation
 		texts     []*EntityAnnotation
 	)
 	for _, face := range res.FaceAnnotations {
 		faces = append(faces, faceAnnotationFromProto(face))
 	}
 	for _, landmark := range res.LandmarkAnnotations {
 		landmarks = append(landmarks, entityAnnotationFromProto(landmark))
 	}
 	for _, logo := range res.LogoAnnotations {
 		logos = append(logos, entityAnnotationFromProto(logo))
 	}
 	for _, label := range res.LabelAnnotations {
 		labels = append(labels, entityAnnotationFromProto(label))
 	}
 	for _, text := range res.TextAnnotations {
 		texts = append(texts, entityAnnotationFromProto(text))
 	}

 	result := &Annotations{
 		Faces:      faces,
 		Landmarks:  landmarks,
 		Logos:      logos,
 		Labels:     labels,
 		Texts:      texts,
 		FullText:   textAnnotationFromProto(res.FullTextAnnotation),
 		SafeSearch: safeSearchAnnotationFromProto(res.SafeSearchAnnotation),
 		ImageProps: imagePropertiesFromProto(res.ImagePropertiesAnnotation),
 		Web:        webDetectionFromProto(res.WebDetection),
 		CropHints:  cropHintsFromProto(res.CropHintsAnnotation),
 	}

 	if status := res.Error; status != nil {
 		// The response error is a google.rpc.Status. It is turned into a gRPC
 		// error because that keeps the status code as a separate field.
 		// TODO(jba): preserve the details field.
 		result.Error = grpc.Errorf(codes.Code(status.Code), "%s", status.Message)
 	}
 	return result
 }

 // A FaceAnnotation describes the results of face detection on an image.
 type FaceAnnotation struct {
 	// BoundingPoly is the bounding polygon around the face. The coordinates of
 	// the bounding box are in the original image's scale, as returned in
 	// ImageParams. The bounding box is computed to "frame" the face in
 	// accordance with human expectations. It is based on the landmarker
 	// results. Note that one or more x and/or y coordinates may not be
 	// generated in the BoundingPoly (the polygon will be unbounded) if only a
 	// partial face appears in the image to be annotated.
 	BoundingPoly []image.Point

 	// FDBoundingPoly is tighter than BoundingPoly, and
 	// encloses only the skin part of the face. Typically, it is used to
 	// eliminate the face from any image analysis that detects the "amount of
 	// skin" visible in an image. It is not based on the landmarker results, only
 	// on the initial face detection, hence the fd (face detection) prefix.
 	FDBoundingPoly []image.Point

 	// Face holds the detected face landmarks.
 	Face FaceLandmarks

 	// RollAngle indicates the amount of clockwise/anti-clockwise rotation of
 	// the face relative to the image vertical, about the axis perpendicular to
 	// the face. Range [-180,180].
 	RollAngle float32

 	// PanAngle is the yaw angle: the leftward/rightward angle that the face is
 	// pointing, relative to the vertical plane perpendicular to the image. Range
 	// [-180,180].
 	PanAngle float32

 	// TiltAngle is the pitch angle: the upwards/downwards angle that the face is
 	// pointing relative to the image's horizontal plane. Range [-180,180].
 	TiltAngle float32

 	// DetectionConfidence is the detection confidence. The range is [0, 1].
 	DetectionConfidence float32

 	// LandmarkingConfidence is the face landmarking confidence. The range is [0, 1].
 	LandmarkingConfidence float32

 	// Likelihoods expresses the likelihood of various aspects of the face.
 	Likelihoods *FaceLikelihoods
 }

 // faceAnnotationFromProto converts a FaceAnnotation proto into a
 // FaceAnnotation. The landmark list is unpacked into the Face field by
 // populateFaceLandmarks. pfa must be non-nil.
 func faceAnnotationFromProto(pfa *pb.FaceAnnotation) *FaceAnnotation {
 	poly := boundingPolyFromProto(pfa.BoundingPoly)
 	fdPoly := boundingPolyFromProto(pfa.FdBoundingPoly)

 	// Every proto likelihood enum is converted to the package's Likelihood type.
 	likelihoods := &FaceLikelihoods{
 		Joy:          Likelihood(pfa.JoyLikelihood),
 		Sorrow:       Likelihood(pfa.SorrowLikelihood),
 		Anger:        Likelihood(pfa.AngerLikelihood),
 		Surprise:     Likelihood(pfa.SurpriseLikelihood),
 		UnderExposed: Likelihood(pfa.UnderExposedLikelihood),
 		Blurred:      Likelihood(pfa.BlurredLikelihood),
 		Headwear:     Likelihood(pfa.HeadwearLikelihood),
 	}

 	result := &FaceAnnotation{
 		BoundingPoly:          poly,
 		FDBoundingPoly:        fdPoly,
 		RollAngle:             pfa.RollAngle,
 		PanAngle:              pfa.PanAngle,
 		TiltAngle:             pfa.TiltAngle,
 		DetectionConfidence:   pfa.DetectionConfidence,
 		LandmarkingConfidence: pfa.LandmarkingConfidence,
 		Likelihoods:           likelihoods,
 	}
 	populateFaceLandmarks(pfa.Landmarks, &result.Face)
 	return result
 }

 // An EntityAnnotation describes the results of a landmark, label, logo or text
 // detection on an image.
 type EntityAnnotation struct {
 	// ID is an opaque entity ID. Some IDs might be available in Knowledge Graph (KG).
 	// For more details on KG please see:
 	// https://developers.google.com/knowledge-graph/
 	ID string

 	// Locale is the language code for the locale in which the entity textual
 	// description (the Description field) is expressed.
 	Locale string

 	// Description is the entity textual description, expressed in the language of Locale.
 	Description string

 	// Score is the overall score of the result. Range [0, 1].
 	Score float32

 	// Confidence is the accuracy of the entity detection in an image.
 	// For example, for an image containing the Eiffel Tower, this field represents
 	// the confidence that there is a tower in the query image. Range [0, 1].
 	Confidence float32

 	// Topicality is the relevancy of the ICA (Image Content Annotation) label to the
 	// image. For example, the relevancy of 'tower' to an image containing
 	// 'Eiffel Tower' is likely higher than an image containing a distant towering
 	// building, though the confidence that there is a tower may be the same.
 	// Range [0, 1].
 	Topicality float32

 	// BoundingPoly is the image region to which this entity belongs. Not filled currently
 	// for label detection. For text detection, BoundingPolys
 	// are produced for the entire text detected in an image region, followed by
 	// BoundingPolys for each word within the detected text.
 	BoundingPoly []image.Point

 	// Locations contains the location information for the detected entity.
 	// Multiple LatLng structs can be present since one location may indicate the
 	// location of the scene in the query image, and another the location of the
 	// place where the query image was taken. Location information is usually
 	// present for landmarks.
 	Locations []LatLng

 	// Properties are additional optional Property fields.
 	// For example a different kind of score or string that qualifies the entity.
 	Properties []Property
 }

 // entityAnnotationFromProto converts an EntityAnnotation proto into an
 // EntityAnnotation. Locations and Properties stay nil when the proto has no
 // entries for them. e must be non-nil.
 func entityAnnotationFromProto(e *pb.EntityAnnotation) *EntityAnnotation {
 	var locations []LatLng
 	for i := range e.Locations {
 		locations = append(locations, latLngFromProto(e.Locations[i].LatLng))
 	}

 	var properties []Property
 	for i := range e.Properties {
 		properties = append(properties, propertyFromProto(e.Properties[i]))
 	}

 	annotation := new(EntityAnnotation)
 	annotation.ID = e.Mid
 	annotation.Locale = e.Locale
 	annotation.Description = e.Description
 	annotation.Score = e.Score
 	annotation.Confidence = e.Confidence
 	annotation.Topicality = e.Topicality
 	annotation.BoundingPoly = boundingPolyFromProto(e.BoundingPoly)
 	annotation.Locations = locations
 	annotation.Properties = properties
 	return annotation
 }

 // TextAnnotation contains a structured representation of OCR extracted text.
 // The hierarchy of an OCR extracted text structure looks like:
 //     TextAnnotation -> Page -> Block -> Paragraph -> Word -> Symbol
 // Each structural component, starting from Page, may further have its own
 // properties. Properties describe detected languages, breaks etc.
 type TextAnnotation struct {
 	// Pages is the list of pages detected by OCR.
 	Pages []*Page
 	// Text is the UTF-8 text detected on the pages.
 	Text string
 }

 // textAnnotationFromProto converts a TextAnnotation proto into a
 // TextAnnotation, converting every page in order. It returns nil when pta is
 // nil.
 func textAnnotationFromProto(pta *pb.TextAnnotation) *TextAnnotation {
 	if pta == nil {
 		return nil
 	}
 	annotation := &TextAnnotation{Text: pta.Text}
 	for i := range pta.Pages {
 		annotation.Pages = append(annotation.Pages, pageFromProto(pta.Pages[i]))
 	}
 	return annotation
 }

 // A Page is a page of text detected from OCR.
 type Page struct {
 	// Properties holds additional information detected on the page.
 	Properties *TextProperties
 	// Width is the page width in pixels.
 	Width int32
 	// Height is the page height in pixels.
 	Height int32
 	// Blocks is the list of blocks of text, images etc on this page.
 	Blocks []*Block
 }

 // pageFromProto converts a Page proto into a Page, converting each of its
 // blocks in order. It returns nil when p is nil.
 func pageFromProto(p *pb.Page) *Page {
 	if p == nil {
 		return nil
 	}
 	var converted []*Block
 	for i := range p.Blocks {
 		converted = append(converted, blockFromProto(p.Blocks[i]))
 	}
 	page := new(Page)
 	page.Properties = textPropertiesFromProto(p.Property)
 	page.Width = p.Width
 	page.Height = p.Height
 	page.Blocks = converted
 	return page
 }

 // A Block is a logical element on the page.
 type Block struct {
 	// Properties holds additional information detected for the block.
 	Properties *TextProperties
 	// BoundingBox is the bounding box for the block.
 	// The vertices are in the order of top-left, top-right, bottom-right,
 	// bottom-left. When a rotation of the bounding box is detected the rotation
 	// is represented as around the top-left corner as defined when the text is
 	// read in the 'natural' orientation.
 	// For example:
 	//   * when the text is horizontal it might look like:
 	//      0----1
 	//      |    |
 	//      3----2
 	//   * when it's rotated 180 degrees around the top-left corner it becomes:
 	//      2----3
 	//      |    |
 	//      1----0
 	//   and the vertex order will still be (0, 1, 2, 3).
 	BoundingBox []image.Point
 	// Paragraphs is the list of paragraphs in this block (if this block is of type text).
 	Paragraphs []*Paragraph
 	// BlockType is the detected block type (text, image etc) for this block.
 	BlockType BlockType
 }

 // A BlockType represents the kind of Block (text, image, etc.).
 // Each constant below has the numeric value of the corresponding pb.Block_*
 // enum constant.
 type BlockType int

 const (
 	// UnknownBlock is an unknown block type.
 	UnknownBlock BlockType = BlockType(pb.Block_UNKNOWN)
 	// TextBlock is a regular text block.
 	TextBlock BlockType = BlockType(pb.Block_TEXT)
 	// TableBlock is a table block.
 	TableBlock BlockType = BlockType(pb.Block_TABLE)
 	// PictureBlock is an image block.
 	PictureBlock BlockType = BlockType(pb.Block_PICTURE)
 	// RulerBlock is a horizontal/vertical line box.
 	RulerBlock BlockType = BlockType(pb.Block_RULER)
 	// BarcodeBlock is a barcode block.
 	BarcodeBlock BlockType = BlockType(pb.Block_BARCODE)
 )

 // blockFromProto converts a Block proto into a Block, converting each of its
 // paragraphs in order. It returns nil when p is nil.
 func blockFromProto(p *pb.Block) *Block {
 	if p == nil {
 		return nil
 	}
 	var paragraphs []*Paragraph
 	for i := range p.Paragraphs {
 		paragraphs = append(paragraphs, paragraphFromProto(p.Paragraphs[i]))
 	}
 	block := new(Block)
 	block.Properties = textPropertiesFromProto(p.Property)
 	block.BoundingBox = boundingPolyFromProto(p.BoundingBox)
 	block.Paragraphs = paragraphs
 	block.BlockType = BlockType(p.BlockType)
 	return block
 }

 // A Paragraph is a structural unit of text representing a number of words in
 // a certain order.
 type Paragraph struct {
 	// Properties holds additional information detected for the paragraph.
 	Properties *TextProperties
 	// BoundingBox is the bounding box for the paragraph.
 	// The vertices are in the order of top-left, top-right, bottom-right,
 	// bottom-left. When a rotation of the bounding box is detected the rotation
 	// is represented as around the top-left corner as defined when the text is
 	// read in the 'natural' orientation.
 	// For example:
 	//   * when the text is horizontal it might look like:
 	//      0----1
 	//      |    |
 	//      3----2
 	//   * when it's rotated 180 degrees around the top-left corner it becomes:
 	//      2----3
 	//      |    |
 	//      1----0
 	//   and the vertex order will still be (0, 1, 2, 3).
 	BoundingBox []image.Point
 	// Words is the list of words in this paragraph.
 	Words []*Word
 }

 // paragraphFromProto converts a Paragraph proto into a Paragraph, converting
 // each of its words in order. It returns nil when p is nil.
 func paragraphFromProto(p *pb.Paragraph) *Paragraph {
 	if p == nil {
 		return nil
 	}
 	var converted []*Word
 	for i := range p.Words {
 		converted = append(converted, wordFromProto(p.Words[i]))
 	}
 	paragraph := new(Paragraph)
 	paragraph.Properties = textPropertiesFromProto(p.Property)
 	paragraph.BoundingBox = boundingPolyFromProto(p.BoundingBox)
 	paragraph.Words = converted
 	return paragraph
 }

 // A Word is a word in a text document.
 type Word struct {
 	// Properties holds additional information detected for the word.
 	Properties *TextProperties
 	// BoundingBox is the bounding box for the word.
 	// The vertices are in the order of top-left, top-right, bottom-right,
 	// bottom-left. When a rotation of the bounding box is detected the rotation
 	// is represented as around the top-left corner as defined when the text is
 	// read in the 'natural' orientation.
 	// For example:
 	//   * when the text is horizontal it might look like:
 	//      0----1
 	//      |    |
 	//      3----2
 	//   * when it's rotated 180 degrees around the top-left corner it becomes:
 	//      2----3
 	//      |    |
 	//      1----0
 	//   and the vertex order will still be (0, 1, 2, 3).
 	BoundingBox []image.Point
 	// Symbols is the list of symbols in the word.
 	// The order of the symbols follows the natural reading order.
 	Symbols []*Symbol
 }

 // wordFromProto converts a Word proto into a Word, converting each of its
 // symbols in order. It returns nil when p is nil.
 func wordFromProto(p *pb.Word) *Word {
 	if p == nil {
 		return nil
 	}
 	var symbols []*Symbol
 	for i := range p.Symbols {
 		symbols = append(symbols, symbolFromProto(p.Symbols[i]))
 	}
 	word := new(Word)
 	word.Properties = textPropertiesFromProto(p.Property)
 	word.BoundingBox = boundingPolyFromProto(p.BoundingBox)
 	word.Symbols = symbols
 	return word
 }

 // A Symbol is a symbol in a text document.
 type Symbol struct {
 	// Properties holds additional information detected for the symbol.
 	Properties *TextProperties
 	// BoundingBox is the bounding box for the symbol.
 	// The vertices are in the order of top-left, top-right, bottom-right,
 	// bottom-left. When a rotation of the bounding box is detected the rotation
 	// is represented as around the top-left corner as defined when the text is
 	// read in the 'natural' orientation.
 	// For example:
 	//   * when the text is horizontal it might look like:
 	//      0----1
 	//      |    |
 	//      3----2
 	//   * when it's rotated 180 degrees around the top-left corner it becomes:
 	//      2----3
 	//      |    |
 	//      1----0
 	//   and the vertex order will still be (0, 1, 2, 3).
 	BoundingBox []image.Point
 	// Text is the actual UTF-8 representation of the symbol.
 	Text string
 }

 // symbolFromProto converts a Symbol proto into a Symbol. It returns nil when
 // p is nil.
 func symbolFromProto(p *pb.Symbol) *Symbol {
 	if p == nil {
 		return nil
 	}
 	symbol := new(Symbol)
 	symbol.Properties = textPropertiesFromProto(p.Property)
 	symbol.BoundingBox = boundingPolyFromProto(p.BoundingBox)
 	symbol.Text = p.Text
 	return symbol
 }

 // TextProperties contains additional information about an OCR structural component.
 type TextProperties struct {
 	// DetectedLanguages is a list of detected languages together with confidence.
 	DetectedLanguages []*DetectedLanguage
 	// DetectedBreak is the detected start or end of a text segment.
 	// It is nil when the source proto carries no detected break.
 	DetectedBreak *DetectedBreak
 }

 // DetectedLanguage is a detected language for a structural component.
 type DetectedLanguage struct {
 	// Code is the BCP-47 language code, such as "en-US" or "sr-Latn".
 	Code language.Tag
 	// Confidence is the confidence of the detected language, in the range [0, 1].
 	Confidence float32
 }

 // DetectedBreak is the detected start or end of a structural component.
 type DetectedBreak struct {
 	// Type is the type of break.
 	Type DetectedBreakType
 	// IsPrefix is true if the break prepends the element.
 	IsPrefix bool
 }

 // A DetectedBreakType represents the kind of a DetectedBreak.
 // Each constant below has the numeric value of the corresponding
 // pb.TextAnnotation_DetectedBreak_* enum constant.
 type DetectedBreakType int

 const (
 	// UnknownBreak is an unknown break label type.
 	UnknownBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_UNKNOWN)
 	// SpaceBreak is a regular space.
 	SpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_SPACE)
 	// SureSpaceBreak is a sure space (very wide).
 	SureSpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_SURE_SPACE)
 	// EOLSureSpaceBreak is a line-wrapping break.
 	EOLSureSpaceBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_EOL_SURE_SPACE)
 	// HyphenBreak is an end-line hyphen that is not present in text; it does
 	// not co-occur with SPACE, LEADER_SPACE, or LINE_BREAK.
 	HyphenBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_HYPHEN)
 	// LineBreak is a line break that ends a paragraph.
 	LineBreak = DetectedBreakType(pb.TextAnnotation_DetectedBreak_LINE_BREAK)
 )

 // textPropertiesFromProto converts a TextAnnotation_TextProperty proto into a
 // TextProperties.
 //
 // It returns nil when p is nil. The page, block, paragraph, word and symbol
 // converters pass their proto's Property field without checking it, so the
 // nil guard lives here; without it a component lacking a Property would cause
 // a nil-pointer panic.
 //
 // In the result, DetectedLanguages is nil when p lists no languages, and
 // DetectedBreak is nil when p has no detected break.
 func textPropertiesFromProto(p *pb.TextAnnotation_TextProperty) *TextProperties {
 	if p == nil {
 		return nil
 	}
 	var dls []*DetectedLanguage
 	for _, dl := range p.DetectedLanguages {
 		// Ignore error. If err != nil the returned tag will not be garbage,
 		// but a best-effort attempt at a parse. At worst it will be
 		// language.Und, the documented "undefined" Tag.
 		tag, _ := language.Parse(dl.LanguageCode)
 		dls = append(dls, &DetectedLanguage{Code: tag, Confidence: dl.Confidence})
 	}
 	var db *DetectedBreak
 	if p.DetectedBreak != nil {
 		db = &DetectedBreak{
 			Type:     DetectedBreakType(p.DetectedBreak.Type),
 			IsPrefix: p.DetectedBreak.IsPrefix,
 		}
 	}
 	return &TextProperties{
 		DetectedLanguages: dls,
 		DetectedBreak:     db,
 	}
 }

 // SafeSearchAnnotation describes the results of a SafeSearch detection on an image.
 // Each field is a Likelihood for one content category.
 type SafeSearchAnnotation struct {
 	// Adult is the likelihood that the image contains adult content.
 	Adult Likelihood

 	// Spoof is the likelihood that an obvious modification was made to the
 	// image's canonical version to make it appear funny or offensive.
 	Spoof Likelihood

 	// Medical is the likelihood that this is a medical image.
 	Medical Likelihood

 	// Violence is the likelihood that this image represents violence.
 	Violence Likelihood
 }

 // safeSearchAnnotationFromProto converts a SafeSearchAnnotation proto into a
 // SafeSearchAnnotation, converting each proto likelihood to the package's
 // Likelihood type. It returns nil when s is nil.
 func safeSearchAnnotationFromProto(s *pb.SafeSearchAnnotation) *SafeSearchAnnotation {
 	if s == nil {
 		return nil
 	}
 	annotation := new(SafeSearchAnnotation)
 	annotation.Adult = Likelihood(s.Adult)
 	annotation.Spoof = Likelihood(s.Spoof)
 	annotation.Medical = Likelihood(s.Medical)
 	annotation.Violence = Likelihood(s.Violence)
 	return annotation
 }

 // ImageProps describes properties of the image itself, like the dominant colors.
 type ImageProps struct {
 	// DominantColors describes the dominant colors of the image, one
 	// ColorInfo per color.
 	DominantColors []*ColorInfo
 }

 // imagePropertiesFromProto converts an ImageProperties proto into an
 // ImageProps. It returns nil when ip is nil or when ip has no dominant-colors
 // annotation.
 func imagePropertiesFromProto(ip *pb.ImageProperties) *ImageProps {
 	if ip == nil {
 		return nil
 	}
 	dominant := ip.DominantColors
 	if dominant == nil {
 		return nil
 	}
 	props := &ImageProps{}
 	for i := range dominant.Colors {
 		props.DominantColors = append(props.DominantColors, colorInfoFromProto(dominant.Colors[i]))
 	}
 	return props
 }

 // WebDetection contains relevant information for the image from the Internet.
 type WebDetection struct {
 	// WebEntities holds the entities deduced from similar images on the Internet.
 	WebEntities []*WebEntity
 	// FullMatchingImages holds fully matching images from the Internet.
 	// They're definite near-duplicates and most often a copy of the query image
 	// with merely a size change.
 	FullMatchingImages []*WebImage
 	// PartialMatchingImages holds partial matching images from the Internet.
 	// Those images are similar enough to share some key-point features. For
 	// example an original image will likely have partial matching for its crops.
 	PartialMatchingImages []*WebImage
 	// PagesWithMatchingImages holds web pages containing the matching images
 	// from the Internet.
 	PagesWithMatchingImages []*WebPage
 }

 // webDetectionFromProto converts a WebDetection proto into a WebDetection.
 // It returns nil when p is nil. Each of the four result slices is built
 // independently from its own proto field and is nil when that field is empty.
 func webDetectionFromProto(p *pb.WebDetection) *WebDetection {
 	if p == nil {
 		return nil
 	}
 	var (
 		wes        []*WebEntity
 		fmis, pmis []*WebImage
 		wps        []*WebPage
 	)
 	for _, e := range p.WebEntities {
 		wes = append(wes, webEntityFromProto(e))
 	}
 	for _, m := range p.FullMatchingImages {
 		fmis = append(fmis, webImageFromProto(m))
 	}
 	for _, m := range p.PartialMatchingImages {
 		// Append to pmis itself. Appending to fmis here would discard earlier
 		// partial matches, mix full matches into the partial list, and could
 		// overwrite fmis's backing array.
 		pmis = append(pmis, webImageFromProto(m))
 	}
 	for _, g := range p.PagesWithMatchingImages {
 		wps = append(wps, webPageFromProto(g))
 	}
 	return &WebDetection{
 		WebEntities:             wes,
 		FullMatchingImages:      fmis,
 		PartialMatchingImages:   pmis,
 		PagesWithMatchingImages: wps,
 	}
 }

 // A WebEntity is an entity deduced from similar images on the Internet.
 type WebEntity struct {
 	// ID is an opaque entity ID.
 	ID string
 	// Score is the overall relevancy score for the entity.
 	// Not normalized and not comparable across different image queries.
 	Score float32
 	// Description is the canonical description of the entity, in English.
 	Description string
 }

 // webEntityFromProto converts a WebDetection_WebEntity proto into a
 // WebEntity. p must be non-nil.
 func webEntityFromProto(p *pb.WebDetection_WebEntity) *WebEntity {
 	entity := new(WebEntity)
 	entity.ID = p.EntityId
 	entity.Score = p.Score
 	entity.Description = p.Description
 	return entity
 }

 // WebImage contains metadata for online images.
 type WebImage struct {
 	// URL is the result image URL.
 	URL string
 	// Score is the overall relevancy score for the image.
 	// Not normalized and not comparable across different image queries.
 	Score float32
 }

 // webImageFromProto converts a WebDetection_WebImage proto into a WebImage.
 // p must be non-nil.
 func webImageFromProto(p *pb.WebDetection_WebImage) *WebImage {
 	img := new(WebImage)
 	img.URL = p.Url
 	img.Score = p.Score
 	return img
 }

 // A WebPage contains metadata for web pages.
 type WebPage struct {
 	// URL is the result web page URL.
 	URL string
 	// Score is the overall relevancy score for the web page.
 	// Not normalized and not comparable across different image queries.
 	Score float32
 }

 // webPageFromProto converts a WebDetection_WebPage proto into a WebPage.
 // p must be non-nil.
 func webPageFromProto(p *pb.WebDetection_WebPage) *WebPage {
 	page := new(WebPage)
 	page.URL = p.Url
 	page.Score = p.Score
 	return page
 }

 // CropHint is a single crop hint that is used to generate a new crop when
 // serving an image.
 type CropHint struct {
 	// BoundingPoly is the bounding polygon for the crop region. The coordinates
 	// of the bounding box are in the original image's scale, as returned in
 	// `ImageParams`.
 	BoundingPoly []image.Point
 	// Confidence is the confidence of this being a salient region. Range [0, 1].
 	Confidence float32
 	// ImportanceFraction is the fraction of importance of this salient region
 	// with respect to the original image.
 	ImportanceFraction float32
 }

 // cropHintsFromProto converts a CropHintsAnnotation proto into a slice of
 // CropHints, preserving order. It returns nil when p is nil or holds no
 // crop hints.
 func cropHintsFromProto(p *pb.CropHintsAnnotation) []*CropHint {
 	if p == nil {
 		return nil
 	}
 	var hints []*CropHint
 	for i := range p.CropHints {
 		hints = append(hints, cropHintFromProto(p.CropHints[i]))
 	}
 	return hints
 }

 // cropHintFromProto converts a CropHint proto into a CropHint. pch must be
 // non-nil.
 func cropHintFromProto(pch *pb.CropHint) *CropHint {
 	hint := new(CropHint)
 	hint.BoundingPoly = boundingPolyFromProto(pch.BoundingPoly)
 	hint.Confidence = pch.Confidence
 	hint.ImportanceFraction = pch.ImportanceFraction
 	return hint
 }