bigquery: Move some options into the GCSReference struct.

This will smooth the path to:
(a) removing Client.Copy
(b) allowing GCSReference to act as an external data source.

Change-Id: I96adf053f4c74fdcb5a78715a193aacc0cf3de3e
Reviewed-on: https://code-review.googlesource.com/7892
Reviewed-by: Jonathan Amsterdam <jba@google.com>

commit: 26070f1f45f79c489d86d7170632298ae627ba1a [log] [tgz]
author: Michael McGreevy <mcgreevy@golang.org> Tue Sep 27 17:18:27 2016 +1000
committer: Michael McGreevy <mcgreevy@golang.org> Fri Sep 30 04:54:21 2016 +0000
tree: 3b1dd9ac248d1aff7a4bf365c9bf2d89d7207422
parent: 2c87db0fa84b541e9f6513737402e6407574a844 [diff]
diff --git a/bigquery/extract_test.go b/bigquery/extract_test.go
index b97c39e..1fd08fa 100644
--- a/bigquery/extract_test.go
+++ b/bigquery/extract_test.go

@@ -46,12 +46,12 @@
 		want    *bq.Job
 	}{
 		{
-			dst:  defaultGCS,
+			dst:  defaultGCS(),
 			src:  defaultTable(nil),
 			want: defaultExtractJob(),
 		},
 		{
-			dst: defaultGCS,
+			dst: defaultGCS(),
 			src: defaultTable(nil),
 			options: []Option{
 				DisableHeader(),

diff --git a/bigquery/gcs.go b/bigquery/gcs.go
index 923aeb4..f65cfc7 100644
--- a/bigquery/gcs.go
+++ b/bigquery/gcs.go

@@ -21,23 +21,40 @@
 type GCSReference struct {
 	uris []string
 
-	// FieldDelimiter is the separator for fields in a CSV file, used when loading or exporting data.
+	// FieldDelimiter is the separator for fields in a CSV file, used when reading or exporting data.
 	// The default is ",".
 	FieldDelimiter string
 
-	// The number of rows at the top of a CSV file that BigQuery will skip when loading the data.
+	// The number of rows at the top of a CSV file that BigQuery will skip when reading data.
 	SkipLeadingRows int64
 
-	// SourceFormat is the format of the GCS data to be loaded into BigQuery.
-	// Allowed values are: CSV, JSON, DatastoreBackup.  The default is CSV.
+	// SourceFormat is the format of the GCS data to be read.
+	// Allowed values are: CSV, Avro, JSON, DatastoreBackup.  The default is CSV.
 	SourceFormat DataFormat
-	// Only used when loading data.
+	// AllowJaggedRows causes missing trailing optional columns to be tolerated when reading CSV data.  Missing values are treated as nulls.
+	AllowJaggedRows bool
+	// AllowQuotedNewlines sets whether quoted data sections containing newlines are allowed when reading CSV data.
+	AllowQuotedNewlines bool
+
+	// Encoding is the character encoding of data to be read.
 	Encoding Encoding
+	// MaxBadRecords is the maximum number of bad records that will be ignored when reading data.
+	MaxBadRecords int64
+
+	// IgnoreUnknownValues causes values not matching the schema to be tolerated.
+	// Unknown values are ignored. For CSV this ignores extra values at the end of a line.
+	// For JSON this ignores named values that do not match any column name.
+	// If this field is not set, records containing unknown values are treated as bad records.
+	// The MaxBadRecords field can be used to customize how bad records are handled.
+	IgnoreUnknownValues bool
+
+	// Schema describes the data. It is required when loading CSV or JSON data into a table unless the table already exists.
+	Schema Schema
 
 	// Quote is the value used to quote data sections in a CSV file.
 	// The default quotation character is the double quote ("), which is used if both Quote and ForceZeroQuote are unset.
 	// To specify that no character should be interpreted as a quotation character, set ForceZeroQuote to true.
-	// Only used when loading data.
+	// Only used when reading data.
 	Quote          string
 	ForceZeroQuote bool
 
@@ -45,7 +62,9 @@
 	// Allowed values are: CSV, Avro, JSON.  The default is CSV.
 	// CSV is not supported for tables with nested or repeated fields.
 	DestinationFormat DataFormat
-	// Only used when writing data.  Default is None.
+
+	// Compression specifies the type of compression to apply when writing data to Google Cloud Storage.
+	// Default is None.
 	Compression Compression
 }
 
@@ -93,8 +112,15 @@
 	conf.SourceUris = gcs.uris
 	conf.SkipLeadingRows = gcs.SkipLeadingRows
 	conf.SourceFormat = string(gcs.SourceFormat)
+	conf.AllowJaggedRows = gcs.AllowJaggedRows
+	conf.AllowQuotedNewlines = gcs.AllowQuotedNewlines
 	conf.Encoding = string(gcs.Encoding)
 	conf.FieldDelimiter = gcs.FieldDelimiter
+	conf.IgnoreUnknownValues = gcs.IgnoreUnknownValues
+	conf.MaxBadRecords = gcs.MaxBadRecords
+	if gcs.Schema != nil {
+		conf.Schema = gcs.Schema.asTableSchema()
+	}
 
 	if gcs.ForceZeroQuote {
 		quote := ""

diff --git a/bigquery/load_op.go b/bigquery/load_op.go
index 5d01e67..8ac3b22 100644
--- a/bigquery/load_op.go
+++ b/bigquery/load_op.go

@@ -29,6 +29,8 @@
 // A DestinationSchema Option must be supplied when loading data from Google Cloud Storage into a non-existent table.
 // Caveat: DestinationSchema is not required if the data being loaded is a datastore backup.
 // schema must not be nil.
+//
+// Deprecated: use GCSReference.Schema instead.
 func DestinationSchema(schema Schema) Option { return destSchema{Schema: schema} }
 
 type destSchema struct {
@@ -43,6 +45,8 @@
 
 // MaxBadRecords returns an Option that sets the maximum number of bad records that will be ignored.
 // If this maximum is exceeded, the operation will be unsuccessful.
+//
+// Deprecated: use GCSReference.MaxBadRecords instead.
 func MaxBadRecords(n int64) Option { return maxBadRecords(n) }
 
 type maxBadRecords int64
@@ -54,6 +58,8 @@
 }
 
 // AllowJaggedRows returns an Option that causes missing trailing optional columns to be tolerated in CSV data.  Missing values are treated as nulls.
+//
+// Deprecated: use GCSReference.AllowJaggedRows instead.
 func AllowJaggedRows() Option { return allowJaggedRows{} }
 
 type allowJaggedRows struct{}
@@ -65,6 +71,8 @@
 }
 
 // AllowQuotedNewlines returns an Option that allows quoted data sections containing newlines in CSV data.
+//
+// Deprecated: use GCSReference.AllowQuotedNewlines instead.
 func AllowQuotedNewlines() Option { return allowQuotedNewlines{} }
 
 type allowQuotedNewlines struct{}
@@ -80,6 +88,8 @@
 // For JSON this ignores named values that do not match any column name.
 // If this Option is not used, records containing unknown values are treated as bad records.
 // The MaxBadRecords Option can be used to customize how bad records are handled.
+//
+// Deprecated: use GCSReference.IgnoreUnknownValues instead.
 func IgnoreUnknownValues() Option { return ignoreUnknownValues{} }
 
 type ignoreUnknownValues struct{}

diff --git a/bigquery/load_test.go b/bigquery/load_test.go
index 19482b9..572c9d2 100644
--- a/bigquery/load_test.go
+++ b/bigquery/load_test.go

@@ -74,12 +74,12 @@
 	}{
 		{
 			dst:  defaultTable(nil),
-			src:  defaultGCS,
+			src:  defaultGCS(),
 			want: defaultLoadJob(),
 		},
-		{
+		{ // old-style options relating to GCS data.
 			dst: defaultTable(nil),
-			src: defaultGCS,
+			src: defaultGCS(),
 			options: []Option{
 				MaxBadRecords(1),
 				AllowJaggedRows(),
@@ -95,6 +95,25 @@
 				return j
 			}(),
 		},
+		{ // same as above, but with the options set in GCSReference fields.
+			dst: defaultTable(nil),
+			src: func() *GCSReference {
+				g := defaultGCS()
+				g.MaxBadRecords = 1
+				g.AllowJaggedRows = true
+				g.AllowQuotedNewlines = true
+				g.IgnoreUnknownValues = true
+				return g
+			}(),
+			want: func() *bq.Job {
+				j := defaultLoadJob()
+				j.Configuration.Load.MaxBadRecords = 1
+				j.Configuration.Load.AllowJaggedRows = true
+				j.Configuration.Load.AllowQuotedNewlines = true
+				j.Configuration.Load.IgnoreUnknownValues = true
+				return j
+			}(),
+		},
 		{
 			dst: &Table{
 				ProjectID: "project-id",
@@ -102,7 +121,7 @@
 				TableID:   "table-id",
 			},
 			options: []Option{CreateNever, WriteTruncate},
-			src:     defaultGCS,
+			src:     defaultGCS(),
 			want: func() *bq.Job {
 				j := defaultLoadJob()
 				j.Configuration.Load.CreateDisposition = "CREATE_NEVER"
@@ -110,13 +129,13 @@
 				return j
 			}(),
 		},
-		{
+		{ // old-style option for schema
 			dst: &Table{
 				ProjectID: "project-id",
 				DatasetID: "dataset-id",
 				TableID:   "table-id",
 			},
-			src: defaultGCS,
+			src: defaultGCS(),
 			options: []Option{
 				DestinationSchema(Schema{
 					stringFieldSchema(),
@@ -133,6 +152,30 @@
 				return j
 			}(),
 		},
+		{ // same as above, but with the schema set in GCSReference.
+			dst: &Table{
+				ProjectID: "project-id",
+				DatasetID: "dataset-id",
+				TableID:   "table-id",
+			},
+			src: func() *GCSReference {
+				g := defaultGCS()
+				g.Schema = Schema{
+					stringFieldSchema(),
+					nestedFieldSchema(),
+				}
+				return g
+			}(),
+			want: func() *bq.Job {
+				j := defaultLoadJob()
+				j.Configuration.Load.Schema = &bq.TableSchema{
+					Fields: []*bq.TableFieldSchema{
+						bqStringFieldSchema(),
+						bqNestedFieldSchema(),
+					}}
+				return j
+			}(),
+		},
 		{
 			dst: defaultTable(nil),
 			src: &GCSReference{

diff --git a/bigquery/utils_test.go b/bigquery/utils_test.go
index 6411c5d..8b3aab9 100644
--- a/bigquery/utils_test.go
+++ b/bigquery/utils_test.go

@@ -19,8 +19,10 @@
 	bq "google.golang.org/api/bigquery/v2"
 )
 
-var defaultGCS = &GCSReference{
-	uris: []string{"uri"},
+func defaultGCS() *GCSReference {
+	return &GCSReference{
+		uris: []string{"uri"},
+	}
 }
 
 var defaultQuery = &Query{
commit	26070f1f45f79c489d86d7170632298ae627ba1a	[log] [tgz]
author	Michael McGreevy <mcgreevy@golang.org>	Tue Sep 27 17:18:27 2016 +1000
committer	Michael McGreevy <mcgreevy@golang.org>	Fri Sep 30 04:54:21 2016 +0000
tree	3b1dd9ac248d1aff7a4bf365c9bf2d89d7207422
parent	2c87db0fa84b541e9f6513737402e6407574a844 [diff]