blob: 22bc945739a546584c3fa4eb7eb0f5caa6fa9a47 [file] [log] [blame]
// Copyright 2015 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package bigquery
import (
"encoding/json"
"errors"
"fmt"
"reflect"
"sync"
bq "google.golang.org/api/bigquery/v2"
)
// Schema describes the fields in a table or query result.
type Schema []*FieldSchema
// Relax returns a version of the schema where no fields are marked
// as Required.
func (s Schema) Relax() Schema {
var out Schema
for _, v := range s {
relaxed := &FieldSchema{
Name: v.Name,
Description: v.Description,
Repeated: v.Repeated,
Required: false,
Type: v.Type,
Schema: v.Schema.Relax(),
}
out = append(out, relaxed)
}
return out
}
// ToJSONFields exposes the schema as a JSON array of
// TableFieldSchema objects: https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema
//
// Generally this isn't needed for direct usage of this library, but is
// provided for use cases where you're interacting with other tools
// that consume the underlying API representation directly such as the
// BQ CLI tool.
func (s Schema) ToJSONFields() ([]byte, error) {
var rawSchema []*bq.TableFieldSchema
for _, f := range s {
rawSchema = append(rawSchema, f.toBQ())
}
// Use json.MarshalIndent to make the output more human-readable.
return json.MarshalIndent(rawSchema, "", " ")
}
// FieldSchema describes a single field.
type FieldSchema struct {
// The field name.
// Must contain only letters (a-z, A-Z), numbers (0-9), or underscores (_),
// and must start with a letter or underscore.
// The maximum length is 128 characters.
Name string
// A description of the field. The maximum length is 16,384 characters.
Description string
// Whether the field may contain multiple values.
Repeated bool
// Whether the field is required. Ignored if Repeated is true.
Required bool
// The field data type. If Type is Record, then this field contains a nested schema,
// which is described by Schema.
Type FieldType
// Annotations for enforcing column-level security constraints.
PolicyTags *PolicyTagList
// Describes the nested schema if Type is set to Record.
Schema Schema
// Maximum length of the field for STRING or BYTES type.
//
// It is invalid to set value for types other than STRING or BYTES.
//
// For STRING type, this represents the maximum UTF-8 length of strings
// allowed in the field. For BYTES type, this represents the maximum
// number of bytes in the field.
MaxLength int64
// Precision can be used to constrain the maximum number of
// total digits allowed for NUMERIC or BIGNUMERIC types.
//
// It is invalid to set values for Precision for types other than
// NUMERIC or BIGNUMERIC.
//
// For NUMERIC type, acceptable values for Precision must
// be: 1 ≤ (Precision - Scale) ≤ 29. Values for Scale
// must be: 0 ≤ Scale ≤ 9.
//
// For BIGNUMERIC type, acceptable values for Precision must
// be: 1 ≤ (Precision - Scale) ≤ 38. Values for Scale
// must be: 0 ≤ Scale ≤ 38.
Precision int64
// Scale can be used to constrain the maximum number of digits
// in the fractional part of a NUMERIC or BIGNUMERIC type.
//
// If the Scale value is set, the Precision value must be set as well.
//
// It is invalid to set values for Scale for types other than
// NUMERIC or BIGNUMERIC.
//
// See the Precision field for additional guidance about valid values.
Scale int64
// DefaultValueExpression is used to specify the default value of a field
// using a SQL expression. It can only be set for top level fields (columns).
//
// You can use struct or array expression to specify default value for the
// entire struct or array. The valid SQL expressions are:
//
// - Literals for all data types, including STRUCT and ARRAY.
// - The following functions:
// - CURRENT_TIMESTAMP
// - CURRENT_TIME
// - CURRENT_DATE
// - CURRENT_DATETIME
// - GENERATE_UUID
// - RAND
// - SESSION_USER
// - ST_GEOGPOINT
// - Struct or array composed with the above allowed functions, for example:
// [CURRENT_DATE(), DATE '2020-01-01']"
DefaultValueExpression string
// Collation can be set only when the type of field is STRING.
// The following values are supported:
// - 'und:ci': undetermined locale, case insensitive.
// - '': empty string. Default to case-sensitive behavior.
// More information: https://cloud.google.com/bigquery/docs/reference/standard-sql/collation-concepts
Collation string
// Information about the range.
// If the type is RANGE, this field is required.
RangeElementType *RangeElementType
}
func (fs *FieldSchema) toBQ() *bq.TableFieldSchema {
tfs := &bq.TableFieldSchema{
Description: fs.Description,
Name: fs.Name,
Type: string(fs.Type),
PolicyTags: fs.PolicyTags.toBQ(),
MaxLength: fs.MaxLength,
Precision: fs.Precision,
Scale: fs.Scale,
DefaultValueExpression: fs.DefaultValueExpression,
Collation: string(fs.Collation),
RangeElementType: fs.RangeElementType.toBQ(),
}
if fs.Repeated {
tfs.Mode = "REPEATED"
} else if fs.Required {
tfs.Mode = "REQUIRED"
} // else leave as default, which is interpreted as NULLABLE.
for _, f := range fs.Schema {
tfs.Fields = append(tfs.Fields, f.toBQ())
}
return tfs
}
// RangeElementType describes information about the range type.
type RangeElementType struct {
// The subtype of the RANGE, if the type of this field is RANGE.
// Possible values for the field element type of a RANGE include:
// DATE, DATETIME, or TIMESTAMP.
Type FieldType
}
func (rt *RangeElementType) toBQ() *bq.TableFieldSchemaRangeElementType {
if rt == nil {
return nil
}
return &bq.TableFieldSchemaRangeElementType{
Type: string(rt.Type),
}
}
func bqToRangeElementType(rt *bq.TableFieldSchemaRangeElementType) *RangeElementType {
if rt == nil {
return nil
}
return &RangeElementType{
Type: FieldType(rt.Type),
}
}
// PolicyTagList represents the annotations on a schema column for enforcing column-level security.
// For more information, see https://cloud.google.com/bigquery/docs/column-level-security-intro
type PolicyTagList struct {
Names []string
}
func (ptl *PolicyTagList) toBQ() *bq.TableFieldSchemaPolicyTags {
if ptl == nil {
return nil
}
return &bq.TableFieldSchemaPolicyTags{
Names: ptl.Names,
}
}
func bqToPolicyTagList(pt *bq.TableFieldSchemaPolicyTags) *PolicyTagList {
if pt == nil {
return nil
}
return &PolicyTagList{
Names: pt.Names,
}
}
func (s Schema) toBQ() *bq.TableSchema {
var fields []*bq.TableFieldSchema
for _, f := range s {
fields = append(fields, f.toBQ())
}
return &bq.TableSchema{Fields: fields}
}
func bqToFieldSchema(tfs *bq.TableFieldSchema) *FieldSchema {
fs := &FieldSchema{
Description: tfs.Description,
Name: tfs.Name,
Repeated: tfs.Mode == "REPEATED",
Required: tfs.Mode == "REQUIRED",
Type: FieldType(tfs.Type),
PolicyTags: bqToPolicyTagList(tfs.PolicyTags),
MaxLength: tfs.MaxLength,
Precision: tfs.Precision,
Scale: tfs.Scale,
DefaultValueExpression: tfs.DefaultValueExpression,
Collation: tfs.Collation,
RangeElementType: bqToRangeElementType(tfs.RangeElementType),
}
for _, f := range tfs.Fields {
fs.Schema = append(fs.Schema, bqToFieldSchema(f))
}
return fs
}
func bqToSchema(ts *bq.TableSchema) Schema {
if ts == nil {
return nil
}
var s Schema
for _, f := range ts.Fields {
s = append(s, bqToFieldSchema(f))
}
return s
}
// FieldType is the type of field.
type FieldType string
const (
// StringFieldType is a string field type.
StringFieldType FieldType = "STRING"
// BytesFieldType is a bytes field type.
BytesFieldType FieldType = "BYTES"
// IntegerFieldType is a integer field type.
IntegerFieldType FieldType = "INTEGER"
// FloatFieldType is a float field type.
FloatFieldType FieldType = "FLOAT"
// BooleanFieldType is a boolean field type.
BooleanFieldType FieldType = "BOOLEAN"
// TimestampFieldType is a timestamp field type.
TimestampFieldType FieldType = "TIMESTAMP"
// RecordFieldType is a record field type. It is typically used to create columns with repeated or nested data.
RecordFieldType FieldType = "RECORD"
// DateFieldType is a date field type.
DateFieldType FieldType = "DATE"
// TimeFieldType is a time field type.
TimeFieldType FieldType = "TIME"
// DateTimeFieldType is a datetime field type.
DateTimeFieldType FieldType = "DATETIME"
// NumericFieldType is a numeric field type. Numeric types include integer types, floating point types and the
// NUMERIC data type.
NumericFieldType FieldType = "NUMERIC"
// GeographyFieldType is a string field type. Geography types represent a set of points
// on the Earth's surface, represented in Well Known Text (WKT) format.
GeographyFieldType FieldType = "GEOGRAPHY"
// BigNumericFieldType is a numeric field type that supports values of larger precision
// and scale than the NumericFieldType.
BigNumericFieldType FieldType = "BIGNUMERIC"
// IntervalFieldType is a representation of a duration or an amount of time.
IntervalFieldType FieldType = "INTERVAL"
// JSONFieldType is a representation of a json object.
JSONFieldType FieldType = "JSON"
// RangeFieldType represents a continuous range of values.
RangeFieldType FieldType = "RANGE"
)
var (
errEmptyJSONSchema = errors.New("bigquery: empty JSON schema")
fieldTypes = map[FieldType]bool{
StringFieldType: true,
BytesFieldType: true,
IntegerFieldType: true,
FloatFieldType: true,
BooleanFieldType: true,
TimestampFieldType: true,
RecordFieldType: true,
DateFieldType: true,
TimeFieldType: true,
DateTimeFieldType: true,
NumericFieldType: true,
GeographyFieldType: true,
BigNumericFieldType: true,
IntervalFieldType: true,
JSONFieldType: true,
RangeFieldType: true,
}
// The API will accept alias names for the types based on the Standard SQL type names.
fieldAliases = map[FieldType]FieldType{
"BOOL": BooleanFieldType,
"FLOAT64": FloatFieldType,
"INT64": IntegerFieldType,
"STRUCT": RecordFieldType,
"DECIMAL": NumericFieldType,
"BIGDECIMAL": BigNumericFieldType,
}
)
var typeOfByteSlice = reflect.TypeOf([]byte{})
// InferSchema tries to derive a BigQuery schema from the supplied struct value.
// Each exported struct field is mapped to a field in the schema.
//
// The following BigQuery types are inferred from the corresponding Go types.
// (This is the same mapping as that used for RowIterator.Next.) Fields inferred
// from these types are marked required (non-nullable).
//
// STRING string
// BOOL bool
// INTEGER int, int8, int16, int32, int64, uint8, uint16, uint32
// FLOAT float32, float64
// BYTES []byte
// TIMESTAMP time.Time
// DATE civil.Date
// TIME civil.Time
// DATETIME civil.DateTime
// NUMERIC *big.Rat
// JSON map[string]interface{}
//
// The big.Rat type supports numbers of arbitrary size and precision. Values
// will be rounded to 9 digits after the decimal point before being transmitted
// to BigQuery. See https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#numeric-type
// for more on NUMERIC.
//
// A Go slice or array type is inferred to be a BigQuery repeated field of the
// element type. The element type must be one of the above listed types.
//
// Due to lack of unique native Go type for GEOGRAPHY, there is no schema
// inference to GEOGRAPHY at this time.
//
// This package also provides some value types for expressing the corresponding SQL types.
//
// INTERVAL *IntervalValue
// RANGE *RangeValue
//
// In the case of RANGE types, a RANGE represents a continuous set of values of a given
// element type (DATE, DATETIME, or TIMESTAMP). InferSchema does not attempt to determine
// the element type, as it uses generic Value types to denote the start/end of the range.
//
// Nullable fields are inferred from the NullXXX types, declared in this package:
//
// STRING NullString
// BOOL NullBool
// INTEGER NullInt64
// FLOAT NullFloat64
// TIMESTAMP NullTimestamp
// DATE NullDate
// TIME NullTime
// DATETIME NullDateTime
// GEOGRAPHY NullGeography
//
// For a nullable BYTES field, use the type []byte and tag the field "nullable" (see below).
// For a nullable NUMERIC field, use the type *big.Rat and tag the field "nullable".
//
// A struct field that is of struct type is inferred to be a required field of type
// RECORD with a schema inferred recursively. For backwards compatibility, a field of
// type pointer to struct is also inferred to be required. To get a nullable RECORD
// field, use the "nullable" tag (see below).
//
// InferSchema returns an error if any of the examined fields is of type uint,
// uint64, uintptr, map, interface, complex64, complex128, func, or chan. Future
// versions may handle these cases without error.
//
// Recursively defined structs are also disallowed.
//
// Struct fields may be tagged in a way similar to the encoding/json package.
// A tag of the form
//
// bigquery:"name"
//
// uses "name" instead of the struct field name as the BigQuery field name.
// A tag of the form
//
// bigquery:"-"
//
// omits the field from the inferred schema.
// The "nullable" option marks the field as nullable (not required). It is only
// needed for []byte, *big.Rat and pointer-to-struct fields, and cannot appear on other
// fields. In this example, the Go name of the field is retained:
//
// bigquery:",nullable"
func InferSchema(st interface{}) (Schema, error) {
return inferSchemaReflectCached(reflect.TypeOf(st))
}
var schemaCache sync.Map
type cacheVal struct {
schema Schema
err error
}
func inferSchemaReflectCached(t reflect.Type) (Schema, error) {
var cv cacheVal
v, ok := schemaCache.Load(t)
if ok {
cv = v.(cacheVal)
} else {
s, err := inferSchemaReflect(t)
cv = cacheVal{s, err}
schemaCache.Store(t, cv)
}
return cv.schema, cv.err
}
func inferSchemaReflect(t reflect.Type) (Schema, error) {
rec, err := hasRecursiveType(t, nil)
if err != nil {
return nil, err
}
if rec {
return nil, fmt.Errorf("bigquery: schema inference for recursive type %s", t)
}
return inferStruct(t)
}
func inferStruct(t reflect.Type) (Schema, error) {
switch t.Kind() {
case reflect.Ptr:
if t.Elem().Kind() != reflect.Struct {
return nil, noStructError{t}
}
t = t.Elem()
fallthrough
case reflect.Struct:
return inferFields(t)
default:
return nil, noStructError{t}
}
}
// inferFieldSchema infers the FieldSchema for a Go type
func inferFieldSchema(fieldName string, rt reflect.Type, nullable, json bool) (*FieldSchema, error) {
// Only []byte and struct pointers can be tagged nullable.
if nullable && !(rt == typeOfByteSlice || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) {
return nil, badNullableError{fieldName, rt}
}
// Only structs and struct pointers can be tagged as json.
if json && !(rt.Kind() == reflect.Struct || rt.Kind() == reflect.Ptr && rt.Elem().Kind() == reflect.Struct) {
return nil, badJSONError{fieldName, rt}
}
switch rt {
case typeOfByteSlice:
return &FieldSchema{Required: !nullable, Type: BytesFieldType}, nil
case typeOfGoTime:
return &FieldSchema{Required: true, Type: TimestampFieldType}, nil
case typeOfDate:
return &FieldSchema{Required: true, Type: DateFieldType}, nil
case typeOfTime:
return &FieldSchema{Required: true, Type: TimeFieldType}, nil
case typeOfDateTime:
return &FieldSchema{Required: true, Type: DateTimeFieldType}, nil
case typeOfRat:
// We automatically infer big.Rat values as NUMERIC as we cannot
// determine precision/scale from the type. Users who want the
// larger precision of BIGNUMERIC need to manipulate the inferred
// schema.
return &FieldSchema{Required: !nullable, Type: NumericFieldType}, nil
case typeOfIntervalValue:
return &FieldSchema{Required: !nullable, Type: IntervalFieldType}, nil
case typeOfRangeValue:
// We can't fully infer the element type of a range without additional
// information, and don't set the RangeElementType when inferred.
return &FieldSchema{Required: !nullable, Type: RangeFieldType}, nil
}
if ft := nullableFieldType(rt); ft != "" {
return &FieldSchema{Required: false, Type: ft}, nil
}
if isSupportedIntType(rt) || isSupportedUintType(rt) {
return &FieldSchema{Required: true, Type: IntegerFieldType}, nil
}
switch rt.Kind() {
case reflect.Slice, reflect.Array:
et := rt.Elem()
if et != typeOfByteSlice && (et.Kind() == reflect.Slice || et.Kind() == reflect.Array) {
// Multi dimensional slices/arrays are not supported by BigQuery
return nil, unsupportedFieldTypeError{fieldName, rt}
}
if nullableFieldType(et) != "" {
// Repeated nullable types are not supported by BigQuery.
return nil, unsupportedFieldTypeError{fieldName, rt}
}
f, err := inferFieldSchema(fieldName, et, false, false)
if err != nil {
return nil, err
}
f.Repeated = true
f.Required = false
return f, nil
case reflect.Ptr:
if rt.Elem().Kind() != reflect.Struct {
return nil, unsupportedFieldTypeError{fieldName, rt}
}
fallthrough
case reflect.Struct:
if json {
return &FieldSchema{Required: !nullable, Type: JSONFieldType}, nil
}
nested, err := inferStruct(rt)
if err != nil {
return nil, err
}
return &FieldSchema{Required: !nullable, Type: RecordFieldType, Schema: nested}, nil
case reflect.String:
return &FieldSchema{Required: !nullable, Type: StringFieldType}, nil
case reflect.Bool:
return &FieldSchema{Required: !nullable, Type: BooleanFieldType}, nil
case reflect.Float32, reflect.Float64:
return &FieldSchema{Required: !nullable, Type: FloatFieldType}, nil
case reflect.Map:
if rt.Key().Kind() != reflect.String {
return nil, unsupportedFieldTypeError{fieldName, rt}
}
return &FieldSchema{Required: !nullable, Type: JSONFieldType}, nil
default:
return nil, unsupportedFieldTypeError{fieldName, rt}
}
}
// inferFields extracts all exported field types from struct type.
func inferFields(rt reflect.Type) (Schema, error) {
var s Schema
fields, err := fieldCache.Fields(rt)
if err != nil {
return nil, err
}
for _, field := range fields {
var nullable, json bool
for _, opt := range field.ParsedTag.([]string) {
if opt == nullableTagOption {
nullable = true
}
if opt == jsonTagOption {
json = true
}
}
f, err := inferFieldSchema(field.Name, field.Type, nullable, json)
if err != nil {
return nil, err
}
f.Name = field.Name
s = append(s, f)
}
return s, nil
}
// isSupportedIntType reports whether t is an int type that can be properly
// represented by the BigQuery INTEGER/INT64 type.
func isSupportedIntType(t reflect.Type) bool {
switch t.Kind() {
case reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64, reflect.Int:
return true
default:
return false
}
}
// isSupportedIntType reports whether t is a uint type that can be properly
// represented by the BigQuery INTEGER/INT64 type.
func isSupportedUintType(t reflect.Type) bool {
switch t.Kind() {
case reflect.Uint8, reflect.Uint16, reflect.Uint32:
return true
default:
return false
}
}
// typeList is a linked list of reflect.Types.
type typeList struct {
t reflect.Type
next *typeList
}
func (l *typeList) has(t reflect.Type) bool {
for l != nil {
if l.t == t {
return true
}
l = l.next
}
return false
}
// hasRecursiveType reports whether t or any type inside t refers to itself, directly or indirectly,
// via exported fields. (Schema inference ignores unexported fields.)
func hasRecursiveType(t reflect.Type, seen *typeList) (bool, error) {
for t.Kind() == reflect.Ptr || t.Kind() == reflect.Slice || t.Kind() == reflect.Array {
t = t.Elem()
}
if t.Kind() != reflect.Struct {
return false, nil
}
if seen.has(t) {
return true, nil
}
fields, err := fieldCache.Fields(t)
if err != nil {
return false, err
}
seen = &typeList{t, seen}
// Because seen is a linked list, additions to it from one field's
// recursive call will not affect the value for subsequent fields' calls.
for _, field := range fields {
ok, err := hasRecursiveType(field.Type, seen)
if err != nil {
return false, err
}
if ok {
return true, nil
}
}
return false, nil
}
// validateKnownType ensures a type is known (or alias of a known type).
func validateKnownType(in FieldType) (FieldType, error) {
if _, ok := fieldTypes[in]; !ok {
// not a defined type, check aliases.
if resolved, ok := fieldAliases[in]; ok {
return resolved, nil
}
return "", fmt.Errorf("unknown field type (%s)", in)
}
return in, nil
}
// SchemaFromJSON takes a native JSON BigQuery table schema definition and converts it to
// a populated Schema. The native API definition is used by tools such as the BQ CLI and
// https://github.com/GoogleCloudPlatform/protoc-gen-bq-schema.
//
// The expected format is a JSON array of TableFieldSchema objects from the underlying API:
// https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema
func SchemaFromJSON(schemaJSON []byte) (Schema, error) {
// Make sure we actually have some content:
if len(schemaJSON) == 0 {
return nil, errEmptyJSONSchema
}
var rawSchema []*bq.TableFieldSchema
if err := json.Unmarshal(schemaJSON, &rawSchema); err != nil {
return nil, err
}
convertedSchema := Schema{}
for _, f := range rawSchema {
convField := bqToFieldSchema(f)
// Normalize the types.
validType, err := validateKnownType(convField.Type)
if err != nil {
return nil, err
}
convField.Type = validType
convertedSchema = append(convertedSchema, convField)
}
return convertedSchema, nil
}
type noStructError struct {
typ reflect.Type
}
func (e noStructError) Error() string {
return fmt.Sprintf("bigquery: can only infer schema from struct or pointer to struct, not %s", e.typ)
}
type badNullableError struct {
name string
typ reflect.Type
}
func (e badNullableError) Error() string {
return fmt.Sprintf(`bigquery: field %q of type %s: use "nullable" only for []byte and struct pointers; for all other types, use a NullXXX type`, e.name, e.typ)
}
type badJSONError struct {
name string
typ reflect.Type
}
func (e badJSONError) Error() string {
return fmt.Sprintf(`bigquery: field %q of type %s: use "json" only for struct and struct pointers`, e.name, e.typ)
}
type unsupportedFieldTypeError struct {
name string
typ reflect.Type
}
func (e unsupportedFieldTypeError) Error() string {
return fmt.Sprintf("bigquery: field %q: type %s is not supported", e.name, e.typ)
}