bigquery: Move some options into the GCSReference struct.
This will smooth the path to:
(a) removing Client.Copy
(b) allowing GCSReference to act as an external data source.
Change-Id: I96adf053f4c74fdcb5a78715a193aacc0cf3de3e
Reviewed-on: https://code-review.googlesource.com/7892
Reviewed-by: Jonathan Amsterdam <jba@google.com>
diff --git a/bigquery/extract_test.go b/bigquery/extract_test.go
index b97c39e..1fd08fa 100644
--- a/bigquery/extract_test.go
+++ b/bigquery/extract_test.go
@@ -46,12 +46,12 @@
want *bq.Job
}{
{
- dst: defaultGCS,
+ dst: defaultGCS(),
src: defaultTable(nil),
want: defaultExtractJob(),
},
{
- dst: defaultGCS,
+ dst: defaultGCS(),
src: defaultTable(nil),
options: []Option{
DisableHeader(),
diff --git a/bigquery/gcs.go b/bigquery/gcs.go
index 923aeb4..f65cfc7 100644
--- a/bigquery/gcs.go
+++ b/bigquery/gcs.go
@@ -21,23 +21,40 @@
type GCSReference struct {
uris []string
- // FieldDelimiter is the separator for fields in a CSV file, used when loading or exporting data.
+ // FieldDelimiter is the separator for fields in a CSV file, used when reading or exporting data.
// The default is ",".
FieldDelimiter string
- // The number of rows at the top of a CSV file that BigQuery will skip when loading the data.
+ // SkipLeadingRows is the number of rows at the top of a CSV file that BigQuery will skip when reading data.
SkipLeadingRows int64
- // SourceFormat is the format of the GCS data to be loaded into BigQuery.
- // Allowed values are: CSV, JSON, DatastoreBackup. The default is CSV.
+ // SourceFormat is the format of the GCS data to be read.
+ // Allowed values are: CSV, Avro, JSON, DatastoreBackup. The default is CSV.
SourceFormat DataFormat
- // Only used when loading data.
+ // AllowJaggedRows causes missing trailing optional columns to be tolerated when reading CSV data. Missing values are treated as nulls.
+ AllowJaggedRows bool
+ // AllowQuotedNewlines sets whether quoted data sections containing newlines are allowed when reading CSV data.
+ AllowQuotedNewlines bool
+
+ // Encoding is the character encoding of data to be read.
Encoding Encoding
+ // MaxBadRecords is the maximum number of bad records that will be ignored when reading data.
+ MaxBadRecords int64
+
+ // IgnoreUnknownValues causes values not matching the schema to be tolerated.
+ // Unknown values are ignored. For CSV this ignores extra values at the end of a line.
+ // For JSON this ignores named values that do not match any column name.
+ // If this field is not set, records containing unknown values are treated as bad records.
+ // The MaxBadRecords field can be used to customize how bad records are handled.
+ IgnoreUnknownValues bool
+
+ // Schema describes the data. It is required when loading CSV or JSON data into a table unless the table already exists.
+ Schema Schema
// Quote is the value used to quote data sections in a CSV file.
// The default quotation character is the double quote ("), which is used if both Quote and ForceZeroQuote are unset.
// To specify that no character should be interpreted as a quotation character, set ForceZeroQuote to true.
- // Only used when loading data.
+ // Only used when reading data.
Quote string
ForceZeroQuote bool
@@ -45,7 +62,9 @@
// Allowed values are: CSV, Avro, JSON. The default is CSV.
// CSV is not supported for tables with nested or repeated fields.
DestinationFormat DataFormat
- // Only used when writing data. Default is None.
+
+ // Compression specifies the type of compression to apply when writing data to Google Cloud Storage.
+ // Default is None.
Compression Compression
}
@@ -93,8 +112,15 @@
conf.SourceUris = gcs.uris
conf.SkipLeadingRows = gcs.SkipLeadingRows
conf.SourceFormat = string(gcs.SourceFormat)
+ conf.AllowJaggedRows = gcs.AllowJaggedRows
+ conf.AllowQuotedNewlines = gcs.AllowQuotedNewlines
conf.Encoding = string(gcs.Encoding)
conf.FieldDelimiter = gcs.FieldDelimiter
+ conf.IgnoreUnknownValues = gcs.IgnoreUnknownValues
+ conf.MaxBadRecords = gcs.MaxBadRecords
+ if gcs.Schema != nil {
+ conf.Schema = gcs.Schema.asTableSchema()
+ }
if gcs.ForceZeroQuote {
quote := ""
diff --git a/bigquery/load_op.go b/bigquery/load_op.go
index 5d01e67..8ac3b22 100644
--- a/bigquery/load_op.go
+++ b/bigquery/load_op.go
@@ -29,6 +29,8 @@
// A DestinationSchema Option must be supplied when loading data from Google Cloud Storage into a non-existent table.
// Caveat: DestinationSchema is not required if the data being loaded is a datastore backup.
// schema must not be nil.
+//
+// Deprecated: use GCSReference.Schema instead.
func DestinationSchema(schema Schema) Option { return destSchema{Schema: schema} }
type destSchema struct {
@@ -43,6 +45,8 @@
// MaxBadRecords returns an Option that sets the maximum number of bad records that will be ignored.
// If this maximum is exceeded, the operation will be unsuccessful.
+//
+// Deprecated: use GCSReference.MaxBadRecords instead.
func MaxBadRecords(n int64) Option { return maxBadRecords(n) }
type maxBadRecords int64
@@ -54,6 +58,8 @@
}
// AllowJaggedRows returns an Option that causes missing trailing optional columns to be tolerated in CSV data. Missing values are treated as nulls.
+//
+// Deprecated: use GCSReference.AllowJaggedRows instead.
func AllowJaggedRows() Option { return allowJaggedRows{} }
type allowJaggedRows struct{}
@@ -65,6 +71,8 @@
}
// AllowQuotedNewlines returns an Option that allows quoted data sections containing newlines in CSV data.
+//
+// Deprecated: use GCSReference.AllowQuotedNewlines instead.
func AllowQuotedNewlines() Option { return allowQuotedNewlines{} }
type allowQuotedNewlines struct{}
@@ -80,6 +88,8 @@
// For JSON this ignores named values that do not match any column name.
// If this Option is not used, records containing unknown values are treated as bad records.
// The MaxBadRecords Option can be used to customize how bad records are handled.
+//
+// Deprecated: use GCSReference.IgnoreUnknownValues instead.
func IgnoreUnknownValues() Option { return ignoreUnknownValues{} }
type ignoreUnknownValues struct{}
diff --git a/bigquery/load_test.go b/bigquery/load_test.go
index 19482b9..572c9d2 100644
--- a/bigquery/load_test.go
+++ b/bigquery/load_test.go
@@ -74,12 +74,12 @@
}{
{
dst: defaultTable(nil),
- src: defaultGCS,
+ src: defaultGCS(),
want: defaultLoadJob(),
},
- {
+ { // old-style options relating to GCS data.
dst: defaultTable(nil),
- src: defaultGCS,
+ src: defaultGCS(),
options: []Option{
MaxBadRecords(1),
AllowJaggedRows(),
@@ -95,6 +95,25 @@
return j
}(),
},
+ { // same as above, but with the options set in GCSReference fields.
+ dst: defaultTable(nil),
+ src: func() *GCSReference {
+ g := defaultGCS()
+ g.MaxBadRecords = 1
+ g.AllowJaggedRows = true
+ g.AllowQuotedNewlines = true
+ g.IgnoreUnknownValues = true
+ return g
+ }(),
+ want: func() *bq.Job {
+ j := defaultLoadJob()
+ j.Configuration.Load.MaxBadRecords = 1
+ j.Configuration.Load.AllowJaggedRows = true
+ j.Configuration.Load.AllowQuotedNewlines = true
+ j.Configuration.Load.IgnoreUnknownValues = true
+ return j
+ }(),
+ },
{
dst: &Table{
ProjectID: "project-id",
@@ -102,7 +121,7 @@
TableID: "table-id",
},
options: []Option{CreateNever, WriteTruncate},
- src: defaultGCS,
+ src: defaultGCS(),
want: func() *bq.Job {
j := defaultLoadJob()
j.Configuration.Load.CreateDisposition = "CREATE_NEVER"
@@ -110,13 +129,13 @@
return j
}(),
},
- {
+ { // old-style option for schema
dst: &Table{
ProjectID: "project-id",
DatasetID: "dataset-id",
TableID: "table-id",
},
- src: defaultGCS,
+ src: defaultGCS(),
options: []Option{
DestinationSchema(Schema{
stringFieldSchema(),
@@ -133,6 +152,30 @@
return j
}(),
},
+ { // same as above, but with the schema set in GCSReference.
+ dst: &Table{
+ ProjectID: "project-id",
+ DatasetID: "dataset-id",
+ TableID: "table-id",
+ },
+ src: func() *GCSReference {
+ g := defaultGCS()
+ g.Schema = Schema{
+ stringFieldSchema(),
+ nestedFieldSchema(),
+ }
+ return g
+ }(),
+ want: func() *bq.Job {
+ j := defaultLoadJob()
+ j.Configuration.Load.Schema = &bq.TableSchema{
+ Fields: []*bq.TableFieldSchema{
+ bqStringFieldSchema(),
+ bqNestedFieldSchema(),
+ }}
+ return j
+ }(),
+ },
{
dst: defaultTable(nil),
src: &GCSReference{
diff --git a/bigquery/utils_test.go b/bigquery/utils_test.go
index 6411c5d..8b3aab9 100644
--- a/bigquery/utils_test.go
+++ b/bigquery/utils_test.go
@@ -19,8 +19,10 @@
bq "google.golang.org/api/bigquery/v2"
)
-var defaultGCS = &GCSReference{
- uris: []string{"uri"},
+func defaultGCS() *GCSReference {
+ return &GCSReference{
+ uris: []string{"uri"},
+ }
}
var defaultQuery = &Query{