tensorflow/python/ops/nn_impl.py - third_party/github.com/tensorflow/tensorflow - Git at Google

 # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # =============================================================================
 """Implementation of Neural Net (NN) functions."""

 import math

 from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import array_ops_stack
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import cond as tf_cond
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import gen_array_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import util as losses_util
 from tensorflow.python.platform import device_context
 from tensorflow.python.util import dispatch
 from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export


 @tf_export("nn.log_poisson_loss")
 @dispatch.add_dispatch_support
 def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
   """Computes the log Poisson loss of `targets` given `log_input`.

   Gives the negative log-likelihood of `targets` under the assumption that
   each target is Poisson distributed with rate `exp(log_input)`.

   Caveat: By default, this is not the exact loss, but the loss minus the term
     [log(z!)], which is constant with respect to `log_input`. Dropping it has
     no effect on optimization, but it does not play well with relative loss
     comparisons. Specify compute_full_loss=True to add an approximation of
     the log factorial term based on Stirling's Approximation.

   For brevity, let `c = log(x) = log_input`, `z = targets`.  The log Poisson
   loss is

         -log(exp(-x) * (x^z) / z!)
       = -log(exp(-x) * (x^z)) + log(z!)
       ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
           [ Note the bracketed term is the Stirling's Approximation for
             log(z!). It is invariant to x and does not affect optimization,
             though it is important for correct relative loss comparisons. It
             is only computed when compute_full_loss == True. ]
       = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
       = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

   When the Stirling term is requested, it is replaced by zero for every
   element of `targets` that lies in the closed interval [0, 1].

   Args:
     targets: A `Tensor` of the same type and shape as `log_input`.
     log_input: A `Tensor` of type `float32` or `float64`.
     compute_full_loss: Whether to compute the full loss. If false, the
       constant term is dropped in favor of more efficient optimization.
     name: A name for the operation (optional).

   Returns:
     A `Tensor` of the same shape as `log_input` with the componentwise
     log Poisson losses.

   Raises:
     ValueError: If the shapes of `log_input` and `targets` are not compatible.
   """
   with ops.name_scope(name, "log_poisson_loss", [log_input, targets]):
     log_input = ops.convert_to_tensor(log_input, name="log_input")
     targets = ops.convert_to_tensor(targets, name="targets")
     try:
       targets.get_shape().assert_is_compatible_with(log_input.get_shape())
     except ValueError:
       raise ValueError(
           "`log_input` and `targets` must have the same shape, received "
           f"({log_input.get_shape()} vs {targets.get_shape()}).")

     # Loss without the log(z!) term: exp(c) - c * z.
     loss = math_ops.exp(log_input) - log_input * targets
     if not compute_full_loss:
       return loss

     # The constants are created as tensors so that their dtype can be matched
     # to that of the targets.
     half = constant_op.constant(0.5, dtype=targets.dtype)
     two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)

     # Stirling's Approximation: z * log(z) - z + 0.5 * log(2 * pi * z).
     z_log_z_minus_z = targets * math_ops.log(targets) - targets
     half_log_two_pi_z = half * math_ops.log(two_pi * targets)
     stirling = z_log_z_minus_z + half_log_two_pi_z

     # Targets inside [0, 1] contribute zero instead of the approximation.
     zeros = array_ops.zeros_like(targets, dtype=targets.dtype)
     ones = array_ops.ones_like(targets, dtype=targets.dtype)
     in_unit_interval = math_ops.logical_and(targets >= zeros, targets <= ones)
     return loss + array_ops.where(in_unit_interval, zeros, stirling)


 @tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"])
 @dispatch.add_dispatch_support
 def sigmoid_cross_entropy_with_logits(
     labels=None,
     logits=None,
     name=None):
   """Computes sigmoid cross entropy given `logits`.

   See sigmoid_cross_entropy_with_logits_v2 for the full documentation.
   """
   # pylint: disable=protected-access
   nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", labels, logits)
   # pylint: enable=protected-access

   with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
     try:
       labels.get_shape().assert_is_compatible_with(logits.get_shape())
     except ValueError:
       raise ValueError("`logits` and `labels` must have the same shape, "
                        f"received ({logits.get_shape()} vs "
                        f"{labels.get_shape()}).")

     # With x = logits and z = labels, the logistic loss is
     #   x - x * z + log(1 + exp(-x))
     # For x < 0, a more numerically stable formula is
     #   -x * z + log(1 + exp(x))
     # Both cases are covered by the single expression
     #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
     # max and abs are built from `where` on one shared mask so that gradients
     # can be computed at zero.
     zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
     is_non_negative = (logits >= zeros)
     max_x_0 = array_ops.where(is_non_negative, logits, zeros)
     neg_abs_x = array_ops.where(is_non_negative, -logits, logits)  # pylint: disable=invalid-unary-operand-type
     linear_term = max_x_0 - logits * labels
     log_term = math_ops.log1p(math_ops.exp(neg_abs_x))
     return math_ops.add(linear_term, log_term, name=name)


 # Note: intentionally calling this v2 to not allow existing code with indirect
 # imports to ignore the sentinel behavior.
 @tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[])
 @dispatch.register_binary_elementwise_api
 @dispatch.add_dispatch_support
 def sigmoid_cross_entropy_with_logits_v2(  # pylint: disable=invalid-name
     labels=None,
     logits=None,
     name=None):
   r"""Computes sigmoid cross entropy given `logits`.

   Measures the probability error in tasks with two outcomes, where each
   outcome is independent and its label need not be fully certain. For
   instance, one could perform a regression in which the probability of an
   event happening is known and used as a label. The loss may also be used for
   binary classification, where labels are either zero or one.

   For brevity, let `x = logits`, `z = labels`.  The logistic loss is

         z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
       = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
       = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
       = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
       = (1 - z) * x + log(1 + exp(-x))
       = x - x * z + log(1 + exp(-x))

   For x < 0, to avoid overflow in exp(-x), the above is reformulated as

         x - x * z + log(1 + exp(-x))
       = log(exp(x)) - x * z + log(1 + exp(-x))
       = - x * z + log(1 + exp(x))

   Hence, to ensure stability and avoid overflow, the implementation uses this
   equivalent formulation

       max(x, 0) - x * z + log(1 + exp(-abs(x)))

   `logits` and `labels` must have the same type and shape.

   >>> logits = tf.constant([1., -1., 0., 1., -1., 0., 0.])
   >>> labels = tf.constant([0., 0., 0., 1., 1., 1., 0.5])
   >>> tf.nn.sigmoid_cross_entropy_with_logits(
   ...     labels=labels, logits=logits).numpy()
   array([1.3132617, 0.3132617, 0.6931472, 0.3132617, 1.3132617, 0.6931472,
          0.6931472], dtype=float32)

   Compared to the losses which handle multiple outcomes,
   `tf.nn.softmax_cross_entropy_with_logits` for general multi-class
   classification and `tf.nn.sparse_softmax_cross_entropy_with_logits` for more
   efficient multi-class classification with hard labels,
   `sigmoid_cross_entropy_with_logits` is a slight simplification for binary
   classification:

         sigmoid(x) = softmax([x, 0])[0]

   $$\frac{1}{1 + e^{-x}} = \frac{e^x}{e^x + e^0}$$

   While `sigmoid_cross_entropy_with_logits` works for soft binary labels
   (probabilities between 0 and 1), it can also be used for binary
   classification where the labels are hard. In that case all three symbols are
   equivalent, with a probability 0 indicating the second class and a
   probability 1 indicating the first class:

   >>> sigmoid_logits = tf.constant([1., -1., 0.])
   >>> softmax_logits = tf.stack([sigmoid_logits, tf.zeros_like(sigmoid_logits)],
   ...                           axis=-1)
   >>> soft_binary_labels = tf.constant([1., 1., 0.])
   >>> soft_multiclass_labels = tf.stack(
   ...     [soft_binary_labels, 1. - soft_binary_labels], axis=-1)
   >>> hard_labels = tf.constant([0, 0, 1])
   >>> tf.nn.sparse_softmax_cross_entropy_with_logits(
   ...     labels=hard_labels, logits=softmax_logits).numpy()
   array([0.31326166, 1.3132616 , 0.6931472 ], dtype=float32)
   >>> tf.nn.softmax_cross_entropy_with_logits(
   ...     labels=soft_multiclass_labels, logits=softmax_logits).numpy()
   array([0.31326166, 1.3132616, 0.6931472], dtype=float32)
   >>> tf.nn.sigmoid_cross_entropy_with_logits(
   ...     labels=soft_binary_labels, logits=sigmoid_logits).numpy()
   array([0.31326166, 1.3132616, 0.6931472], dtype=float32)

   Args:
     labels: A `Tensor` of the same type and shape as `logits`. Between 0 and 1,
       inclusive.
     logits: A `Tensor` of type `float32` or `float64`. Any real number.
     name: A name for the operation (optional).

   Returns:
     A `Tensor` of the same shape as `logits` with the componentwise
     logistic losses.

   Raises:
     ValueError: If `logits` and `labels` do not have the same shape.
   """
   # All of the work is delegated to the v1 implementation.
   return sigmoid_cross_entropy_with_logits(
       labels=labels, logits=logits, name=name)


 # Expose the full v2 documentation on the v1 endpoint, whose own docstring is
 # only a short pointer to the v2 function.
 sigmoid_cross_entropy_with_logits.__doc__ = (
     sigmoid_cross_entropy_with_logits_v2.__doc__)


 @tf_export("nn.weighted_cross_entropy_with_logits", v1=[])
 @dispatch.add_dispatch_support
 def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight,
                                           name=None):
   """Computes a weighted cross entropy.

   This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`
   allows one to trade off recall and precision by up- or down-weighting the
   cost of a positive error relative to a negative error.

   The usual cross-entropy cost is defined as:

       labels * -log(sigmoid(logits)) +
           (1 - labels) * -log(1 - sigmoid(logits))

   A value `pos_weight > 1` decreases the false negative count, hence
   increasing the recall. Conversely, setting `pos_weight < 1` decreases the
   false positive count and increases the precision. This follows from the
   fact that `pos_weight` enters the loss expression as a multiplicative
   coefficient on the positive labels term:

       labels * -log(sigmoid(logits)) * pos_weight +
           (1 - labels) * -log(1 - sigmoid(logits))

   For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
   The loss is:

         qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
       = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
       = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
       = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
       = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
       = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

   Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
   the implementation uses

       (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

   `logits` and `labels` must have the same type and shape.

   >>> labels = tf.constant([1., 0.5, 0.])
   >>> logits = tf.constant([1.5, -0.1, -10.])
   >>> tf.nn.weighted_cross_entropy_with_logits(
   ...     labels=labels, logits=logits, pos_weight=tf.constant(1.5)).numpy()
   array([3.0211994e-01, 8.8049585e-01, 4.5776367e-05], dtype=float32)
   >>> tf.nn.weighted_cross_entropy_with_logits(
   ...     labels=labels, logits=logits, pos_weight=tf.constant(0.5)).numpy()
   array([1.00706644e-01, 5.08297503e-01, 4.57763672e-05], dtype=float32)

   Args:
     labels: A `Tensor` of the same type and shape as `logits`, with values
       between 0 and 1 inclusive.
     logits: A `Tensor` of type `float32` or `float64`, any real numbers.
     pos_weight: A coefficient to use on the positive examples, typically a
       scalar but otherwise broadcastable to the shape of `logits`. Its value
       should be non-negative.
     name: A name for the operation (optional).

   Returns:
     A `Tensor` of the same shape as `logits` with the componentwise
     weighted logistic losses.

   Raises:
     ValueError: If `logits` and `labels` do not have the same shape.
   """
   with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
     logits = ops.convert_to_tensor(logits, name="logits")
     labels = ops.convert_to_tensor(labels, name="labels")
     try:
       labels.get_shape().assert_is_compatible_with(logits.get_shape())
     except ValueError:
       raise ValueError("`logits` and `labels` must have the same shape, "
                        f"received ({logits.get_shape()} vs "
                        f"{labels.get_shape()}).")

     # With l = 1 + (q - 1) * z, the loss derived in the docstring is
     #   (1 - z) * x + l * log(1 + exp(-x))
     # For x < 0, a more numerically stable formula is
     #   (1 - z) * x + l * log(1 + exp(x)) - l * x
     # To avoid branching, both cases are merged into
     #   (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
     log_weight = 1 + (pos_weight - 1) * labels
     linear_term = (1 - labels) * logits
     stable_log_term = (
         math_ops.log1p(math_ops.exp(-math_ops.abs(logits))) +
         nn_ops.relu(-logits))  # pylint: disable=invalid-unary-operand-type
     return math_ops.add(linear_term, log_weight * stable_log_term, name=name)


 @tf_export(v1=["nn.weighted_cross_entropy_with_logits"])
 @dispatch.add_dispatch_support
 @deprecated_args(None, "targets is deprecated, use labels instead", "targets")
 def weighted_cross_entropy_with_logits(labels=None,
                                        logits=None,
                                        pos_weight=None,
                                        name=None,
                                        targets=None):
   """Computes a weighted cross entropy.

   This is like `sigmoid_cross_entropy_with_logits()` except that `pos_weight`
   allows one to trade off recall and precision by up- or down-weighting the
   cost of a positive error relative to a negative error.

   The usual cross-entropy cost is defined as:

       labels * -log(sigmoid(logits)) +
           (1 - labels) * -log(1 - sigmoid(logits))

   A value `pos_weight > 1` decreases the false negative count, hence
   increasing the recall. Conversely, setting `pos_weight < 1` decreases the
   false positive count and increases the precision. This follows from the
   fact that `pos_weight` enters the loss expression as a multiplicative
   coefficient on the positive labels term:

       labels * -log(sigmoid(logits)) * pos_weight +
           (1 - labels) * -log(1 - sigmoid(logits))

   For brevity, let `x = logits`, `z = labels`, `q = pos_weight`.
   The loss is:

         qz * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
       = qz * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
       = qz * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
       = qz * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
       = (1 - z) * x + (qz +  1 - z) * log(1 + exp(-x))
       = (1 - z) * x + (1 + (q - 1) * z) * log(1 + exp(-x))

   Setting `l = (1 + (q - 1) * z)`, to ensure stability and avoid overflow,
   the implementation uses

       (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))

   `logits` and `labels` must have the same type and shape.

   Args:
     labels: A `Tensor` of the same type and shape as `logits`.
     logits: A `Tensor` of type `float32` or `float64`.
     pos_weight: A coefficient to use on the positive examples.
     name: A name for the operation (optional).
     targets: Deprecated alias for labels.

   Returns:
     A `Tensor` of the same shape as `logits` with the componentwise
     weighted logistic losses.

   Raises:
     ValueError: If `logits` and `labels` do not have the same shape.
   """
   # Resolve the deprecated `targets` alias, then defer to the v2 endpoint.
   resolved_labels = deprecated_argument_lookup(
       "labels", labels, "targets", targets)
   return weighted_cross_entropy_with_logits_v2(
       resolved_labels, logits, pos_weight, name)


 @tf_export("nn.compute_average_loss")
 @dispatch.add_dispatch_support
 def compute_average_loss(per_example_loss,
                          sample_weight=None,
                          global_batch_size=None):
   """Scales per-example losses with sample_weights and computes their average.

   Usage with distribution strategy and custom training loop:

   ```python
   with strategy.scope():
     def compute_loss(labels, predictions, sample_weight=None):

       # If you are using a `Loss` class instead, set reduction to `NONE` so that
       # we can do the reduction afterwards and divide by global batch size.
       per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
           labels, predictions)

       # Compute loss that is scaled by sample_weight and by global batch size.
       return tf.nn.compute_average_loss(
           per_example_loss,
           sample_weight=sample_weight,
           global_batch_size=GLOBAL_BATCH_SIZE)
   ```

   Args:
     per_example_loss: Per-example loss.
     sample_weight: Optional weighting for each example.
     global_batch_size: Optional global batch size value. Defaults to (size of
       first dimension of `per_example_loss`) * (number of replicas).

   Returns:
     Scalar loss value, obtained by summing the `per_example_loss` and dividing
     by `global_batch_size`. If `global_batch_size` is zero, the result is zero.

   Raises:
     RuntimeError: If `global_batch_size` is not given and the function is
       called in cross replica context while a strategy is in use.
   """  # pylint: disable=g-doc-exception
   per_example_loss = ops.convert_to_tensor(per_example_loss)
   loss_dtype = per_example_loss.dtype

   with losses_util.check_per_example_loss_rank(per_example_loss):
     if sample_weight is not None:
       sample_weight = ops.convert_to_tensor(sample_weight)
       per_example_loss = losses_util.scale_losses_by_sample_weight(
           per_example_loss, sample_weight)
     # Cast back to the dtype the losses had on input.
     per_example_loss = math_ops.cast(per_example_loss, loss_dtype)

     if global_batch_size is None:
       # The batch size is inferred per replica, so this only works in replica
       # context.
       in_cross_replica = (
           distribute_lib.has_strategy()
           and distribute_lib.in_cross_replica_context())
       if in_cross_replica:
         raise RuntimeError(
             "You are calling `compute_average_loss` in cross replica context, "
             "while it was expected to be called in replica context.")

       replica_count = distribute_lib.get_strategy().num_replicas_in_sync
       local_batch_size = array_ops.shape_v2(per_example_loss)[0]
       global_batch_size = local_batch_size * replica_count

     # Validate the batch size, whether it was passed in or inferred.
     check_ops.assert_scalar_v2(
         global_batch_size, message="global_batch_size must be scalar.")
     check_ops.assert_integer_v2(
         global_batch_size,
         message="global_batch_size must be an integer.")
     check_ops.assert_non_negative_v2(
         global_batch_size, message="global_batch_size must be non-negative.")

     summed_loss = math_ops.reduce_sum(per_example_loss)
     divisor = math_ops.cast(global_batch_size, loss_dtype)
     return math_ops.div_no_nan(summed_loss, divisor)


 @tf_export("nn.scale_regularization_loss")
 @dispatch.add_dispatch_support
 def scale_regularization_loss(regularization_loss):
   """Scales the sum of the given regularization losses by number of replicas.

   The losses are summed and the sum is divided by the number of replicas in
   sync of the current distribution strategy.

   Usage with distribution strategy and custom training loop:

   ```python
   with strategy.scope():
     def compute_loss(labels, predictions, sample_weight=None):
       per_example_loss = tf.keras.losses.sparse_categorical_crossentropy(
           labels, predictions)

       # Compute loss that is scaled by sample_weight and by global batch size.
       loss = tf.nn.compute_average_loss(
           per_example_loss,
           sample_weight=sample_weight,
           global_batch_size=GLOBAL_BATCH_SIZE)

       # Add scaled regularization losses.
       loss += tf.nn.scale_regularization_loss(tf.nn.l2_loss(weights))
       return loss
   ```

   Args:
     regularization_loss: Regularization loss.

   Returns:
     Scalar loss value.

   Raises:
     RuntimeError: If called in cross replica context while a strategy is in
       use.
   """  # pylint: disable=g-doc-exception
   in_cross_replica = (
       distribute_lib.has_strategy()
       and distribute_lib.in_cross_replica_context())
   if in_cross_replica:
     raise RuntimeError(
         "You are calling `scale_regularization_loss` in cross replica context, "
         "while it was expected to be called in replica context.")

   replica_count = distribute_lib.get_strategy().num_replicas_in_sync
   summed_regularization = math_ops.reduce_sum(regularization_loss)
   return summed_regularization / replica_count


 @tf_export(v1=["nn.relu_layer"])
 @dispatch.add_dispatch_support
 def relu_layer(x, weights, biases, name=None):
   """Computes Relu(x * weight + biases).

   Args:
     x: a 2D tensor.  Dimensions typically: batch, in_units
     weights: a 2D tensor.  Dimensions typically: in_units, out_units
     biases: a 1D tensor.  Dimensions: out_units
     name: A name for the operation (optional).  If not specified
       "relu_layer" is used.

   Returns:
     A 2-D Tensor computing relu(matmul(x, weights) + biases).
     Dimensions typically: batch, out_units.
   """
   with ops.name_scope(name, "relu_layer", [x, weights, biases]) as name:
     x = ops.convert_to_tensor(x, name="x")
     weights = ops.convert_to_tensor(weights, name="weights")
     biases = ops.convert_to_tensor(biases, name="biases")
     # Affine transform first, then the rectifier, which carries the scope name.
     product = math_ops.matmul(x, weights)
     pre_activation = nn_ops.bias_add(product, biases)
     return nn_ops.relu(pre_activation, name=name)


 @tf_export("nn.silu", "nn.swish")
 @dispatch.register_unary_elementwise_api
 @dispatch.add_dispatch_support
 def swish(features, beta=1.0):
   # pylint: disable=g-doc-args
   """Computes the SiLU or Swish activation function: `x * sigmoid(beta * x)`.

   The SiLU activation function was introduced in "Gaussian Error Linear Units
   (GELUs)" [Hendrycks et al. 2016](https://arxiv.org/abs/1606.08415) and
   "Sigmoid-Weighted Linear Units for Neural Network Function Approximation in
   Reinforcement Learning"
   [Elfwing et al. 2017](https://arxiv.org/abs/1702.03118) and was independently
   discovered (and called swish) in "Searching for Activation Functions"
   [Ramachandran et al. 2017](https://arxiv.org/abs/1710.05941)

   The forward value and its gradient are defined together through
   `custom_gradient`, so that the gradient recomputes `sigmoid(beta * x)`
   rather than reusing the value from the forward pass.

   Args:
     features: A `Tensor` representing preactivation values.
     beta: A `Tensor` representing the value of the beta hyperparameter of the
       Swish activation function. It is converted to a tensor and cast to the
       dtype of `features`. Default value 1.0.

   Returns:
     The activation value.
   """
   # pylint: enable=g-doc-args
   features = ops.convert_to_tensor(features, name="features")
   beta = ops.convert_to_tensor(beta, name="beta")
   # Cast `beta` to the dtype of `features`, so that `beta * features` below
   # operates on a single dtype.
   beta = math_ops.cast(beta, features.dtype)

   @custom_gradient.custom_gradient
   def swish_impl(features, beta):
     # Returns the forward value together with `grad`, which supplies the
     # gradients with respect to `features` and `beta`.

     def grad(dy):
       """Gradient for the Swish activation function."""
       # Naively, x * tf.nn.sigmoid(x) requires keeping both x and sigmoid(x)
       # around for backprop, effectively doubling the tensor's memory
       # consumption. We use a control dependency here so that sigmoid(features)
       # is re-computed during backprop (the control dep prevents it being
       # de-duped with the forward pass) and we can free the sigmoid(features)
       # expression immediately after use during the forward pass.
       with ops.control_dependencies([dy]):
         sigmoid_features = math_ops.sigmoid(beta * features)

       # d/dx [x * sigmoid(beta * x)]
       #   = sigmoid(beta * x) * (1 + beta * x * (1 - sigmoid(beta * x)))
       activation_grad = (
           sigmoid_features * (1.0 + (beta * features) *
                               (1.0 - sigmoid_features)))
       # d/dbeta [x * sigmoid(beta * x)]
       #   = x^2 * sigmoid(beta * x) * (1 - sigmoid(beta * x)),
       # weighted by `dy` and summed over all elements.
       # NOTE(review): the full reduction presumably assumes a scalar `beta` --
       # confirm before relying on this gradient with a non-scalar `beta`.
       beta_grad = math_ops.reduce_sum(
           dy * math_ops.square(features) * sigmoid_features *
           (1.0 - sigmoid_features))
       return (dy * activation_grad, beta_grad)

     return features * math_ops.sigmoid(beta * features), grad

   return swish_impl(features, beta)


 # pylint: disable=redefined-builtin
 @tf_export("linalg.normalize")
 @dispatch.add_dispatch_support
 def normalize(tensor, ord="euclidean", axis=None, name=None):
   """Normalizes `tensor` along dimension `axis` using specified norm.

   The norm along `axis` is computed with `tf.linalg.norm`, and `tensor` is
   then divided by it.

   This function can compute several different vector norms (the 1-norm, the
   Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
   matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).

   Args:
     tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
     ord: Order of the norm. Supported values are `'fro'`, `'euclidean'`, `1`,
       `2`, `np.inf` and any positive real number yielding the corresponding
       p-norm. Default is `'euclidean'` which is equivalent to Frobenius norm if
       `tensor` is a matrix and equivalent to 2-norm for vectors.
       Some restrictions apply: a) The Frobenius norm `'fro'` is not defined for
         vectors, b) If axis is a 2-tuple (matrix norm), only `'euclidean'`,
         `'fro'`, `1`, `2`, `np.inf` are supported. See the description of `axis`
         on how to compute norms for a batch of vectors or matrices stored in a
         tensor.
     axis: If `axis` is `None` (the default), the input is considered a vector
       and a single vector norm is computed over the entire set of values in the
       tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
       `norm(reshape(tensor, [-1]), ord=ord)`. If `axis` is a Python integer, the
       input is considered a batch of vectors, and `axis` determines the axis in
       `tensor` over which to compute vector norms. If `axis` is a 2-tuple of
       Python integers it is considered a batch of matrices and `axis` determines
       the axes in `tensor` over which to compute a matrix norm.
       Negative indices are supported. Example: If you are passing a tensor that
         can be either a matrix or a batch of matrices at runtime, pass
         `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
         computed.
     name: The name of the op.

   Returns:
     normalized: A normalized `Tensor` with the same shape as `tensor`.
     norm: The computed norms, with the same dtype as `tensor`. The norm is
       computed with `keepdims=True`. Same as running
       `tf.cast(tf.linalg.norm(tensor, ord, axis, keepdims=True), tensor.dtype)`.

   Raises:
     ValueError: If `ord` or `axis` is invalid.
   """
   with ops.name_scope(name, "normalize", [tensor]):
     tensor = ops.convert_to_tensor(tensor)
     raw_norm = linalg_ops.norm(tensor, ord, axis, keepdims=True)
     # Bring the norm to the dtype of `tensor` before dividing by it.
     norm = math_ops.cast(raw_norm, tensor.dtype)
     return tensor / norm, norm


 @tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize",
            v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"])
 @dispatch.add_dispatch_support
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   """Normalizes along dimension `axis` using an L2 norm.

   For a 1-D tensor with `axis = 0`, computes

       output = x / sqrt(max(sum(x**2), epsilon))

   For `x` with more dimensions, independently normalizes each 1-D slice along
   dimension `axis`. For complex `x`, the sum of squares is taken over the
   squared real and imaginary parts.

   1-D tensor example:
   >>> x = tf.constant([3.0, 4.0])
   >>> tf.math.l2_normalize(x).numpy()
   array([0.6, 0.8], dtype=float32)

   2-D tensor example:
   >>> x = tf.constant([[3.0], [4.0]])
   >>> tf.math.l2_normalize(x, 0).numpy()
   array([[0.6],
        [0.8]], dtype=float32)

   >>> x = tf.constant([[3.0], [4.0]])
   >>> tf.math.l2_normalize(x, 1).numpy()
   array([[1.],
        [1.]], dtype=float32)

   Args:
     x: A `Tensor`.
     axis: Dimension along which to normalize.  A scalar or a vector of
       integers.
     epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
       divisor if `norm < sqrt(epsilon)`.
     name: A name for this operation (optional).
     dim: Deprecated, do not use.

   Returns:
     A `Tensor` with the same shape as `x`.
   """
   axis = deprecated_argument_lookup("axis", axis, "dim", dim)
   with ops.name_scope(name, "l2_normalize", [x]) as name:
     x = ops.convert_to_tensor(x, name="x")

     if not x.dtype.is_complex:
       squared_norm = math_ops.reduce_sum(
           math_ops.square(x), axis, keepdims=True)
       inv_norm = math_ops.rsqrt(math_ops.maximum(squared_norm, epsilon))
       return math_ops.multiply(x, inv_norm, name=name)

     # Complex input: |x|^2 = real(x)^2 + imag(x)^2, and the real and imaginary
     # parts are scaled separately by the inverse norm.
     re_squared = math_ops.square(math_ops.real(x))
     im_squared = math_ops.square(math_ops.imag(x))
     squared_norm = math_ops.real(
         math_ops.reduce_sum(re_squared + im_squared, axis, keepdims=True))
     inv_norm = math_ops.rsqrt(math_ops.maximum(squared_norm, epsilon))
     scaled_re = math_ops.multiply(math_ops.real(x), inv_norm)
     scaled_im = math_ops.multiply(math_ops.imag(x), inv_norm)
     return math_ops.complex(scaled_re, scaled_im, name=name)


 def _count_nonzero(input_tensor, dtype=dtypes.int64):
   """Counts the nonzero entries of `input_tensor`, reducing in `dtype`.

   Same as math_ops.count_nonzero, except that the reduction is done in
   `dtype`, which can be faster for 32-bit dtypes.

   Args:
       input_tensor: numeric tensor
       dtype: reduction dtype

   Returns:
       number of nonzero values with type dtype
   """
   with ops.name_scope("count_nonzero", values=[input_tensor]):
     zero = array_ops.zeros([], dtype=input_tensor.dtype)
     is_nonzero = math_ops.not_equal(input_tensor, zero)
     # Cast the boolean mask to `dtype` so that the sum runs in that dtype.
     indicator = math_ops.cast(is_nonzero, dtype=dtype)
     return math_ops.reduce_sum(indicator, name="nonzero_count")


 @tf_export("math.zero_fraction", "nn.zero_fraction")
 @dispatch.add_dispatch_support
 def zero_fraction(value, name=None):
   """Returns the fraction of zeros in `value`.

   If `value` is empty, the result is `nan`.

   This is useful in summaries to measure and report sparsity.  For example,

   ```python
       z = tf.nn.relu(...)
       summ = tf.compat.v1.summary.scalar('sparsity', tf.nn.zero_fraction(z))
   ```

   Args:
     value: A tensor of numeric type.
     name: A name for the operation (optional).

   Returns:
     The fraction of zeros in `value`, with type `float32`.
   """
   with ops.name_scope(name, "zero_fraction", [value]):
     value = ops.convert_to_tensor(value, name="value")
     size = array_ops.size(value, out_type=dtypes.int64)

     def _count_with_int32():
       narrow_count = _count_nonzero(value, dtype=dtypes.int32)
       return math_ops.cast(narrow_count, dtype=dtypes.int64)

     def _count_with_int64():
       return _count_nonzero(value, dtype=dtypes.int64)

     # If the count is small, we can save memory/CPU with an int32 reduction.
     num_nonzero = tf_cond.cond(
         size <= dtypes.int32.max,
         true_fn=_count_with_int32,
         false_fn=_count_with_int64)

     with ops.name_scope("counts_to_fraction"):
       num_zero = size - num_nonzero
       num_zero_float32 = math_ops.cast(num_zero, dtype=dtypes.float32)
       fraction = num_zero_float32 / math_ops.cast(size, dtype=dtypes.float32)

     return array_ops.identity(fraction, "fraction")


 # pylint: disable=redefined-builtin
 @tf_export(v1=["nn.depthwise_conv2d"])
 @dispatch.add_dispatch_support
 def depthwise_conv2d(input,
                      filter,
                      strides,
                      padding,
                      rate=None,
                      name=None,
                      data_format=None,
                      dilations=None):
   """Depthwise 2-D convolution.

   Takes a 4D input tensor (in 'NHWC' or 'NCHW' data format) together with a
   filter tensor of shape
   `[filter_height, filter_width, in_channels, channel_multiplier]` that holds
   `in_channels` convolutional filters of depth 1.  `depthwise_conv2d`
   convolves every input channel with its own filter (turning 1 channel into
   `channel_multiplier` channels) and concatenates the per-channel results, so
   the output has `in_channels * channel_multiplier` channels.

   In detail, with the default NHWC format,

       output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
            filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
                                            strides[2] * j + rate[1] * dj, k]

   `strides[0]` and `strides[3]` must both be 1.  When the horizontal and
   vertical strides are equal (the most common case), use
   `strides = [1, stride, stride, 1]`.  If any entry of `rate` exceeds 1 the
   convolution is atrous (dilated), and then every entry of `strides` must be
   equal to 1.

   Usage Example:

   >>> x = np.array([
   ...     [1., 2.],
   ...     [3., 4.],
   ...     [5., 6.]
   ... ], dtype=np.float32).reshape((1, 3, 2, 1))
   >>> kernel = np.array([
   ...     [1., 2.],
   ...     [3., 4]
   ... ], dtype=np.float32).reshape((2, 1, 1, 2))
   >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
   ...                                  padding='VALID').numpy()
     array([[[[10., 14.],
              [14., 20.]],
             [[18., 26.],
              [22., 32.]]]], dtype=float32)

   >>> tf.compat.v1.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
   ...                                  padding=[[0, 0], [1, 0], [1, 0], [0, 0]]
   ...                                 ).numpy()
     array([[[[ 0.,  0.],
              [ 3.,  4.],
              [ 6.,  8.]],
             [[ 0.,  0.],
              [10., 14.],
              [14., 20.]],
             [[ 0.,  0.],
              [18., 26.],
              [22., 32.]]]], dtype=float32)

   Args:
     input: 4-D with shape according to `data_format`.
     filter: 4-D with shape
       `[filter_height, filter_width, in_channels, channel_multiplier]`.
     strides: 1-D of size 4.  The stride of the sliding window for each
       dimension of `input`.
     padding: How the image is padded before the convolution is applied.
       Either the string `"SAME"` or `"VALID"`, naming the padding algorithm,
       or a list giving the explicit paddings at the start and end of each
       dimension. With explicit padding and data_format `"NHWC"`, use the form
       `[[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. With
       explicit padding and data_format `"NCHW"`, use the form
       `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
     rate: 1-D of size 2. The dilation rate used to sample input values across
       the `height` and `width` dimensions in atrous convolution. If it is
       greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
     dilations: Alias of rate.

   Returns:
     A 4-D `Tensor` with shape according to `data_format`.  E.g., for
     "NHWC" format, shape is
     `[batch, out_height, out_width, in_channels * channel_multiplier].`
   """
   rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
   with ops.name_scope(name, "depthwise", [input, filter]) as name:
     input = ops.convert_to_tensor(input, name="tensor_in")
     filter = ops.convert_to_tensor(filter, name="filter_in")
     rate = [1, 1] if rate is None else rate

     # When executing on TPU, call depthwise_conv2d_native directly and pass
     # the dilation as a 4-element list laid out to match `data_format`.
     if device_context.enclosing_tpu_context() is not None:
       native_dilations = ([1, 1, rate[0], rate[1]]
                           if data_format == "NCHW"
                           else [1, rate[0], rate[1], 1])
       return nn_ops.depthwise_conv2d_native(
           input=input,
           filter=filter,
           strides=strides,
           padding=padding,
           data_format=data_format,
           dilations=native_dilations,
           name=name)

     def _native_depthwise(converted_input, _, converted_padding):
       # Invoked by with_space_to_batch with the input and padding it supplies.
       return nn_ops.depthwise_conv2d_native(
           input=converted_input,
           filter=filter,
           strides=strides,
           padding=converted_padding,
           data_format=data_format,
           name=name)

     return nn_ops.with_space_to_batch(
         input=input,
         filter_shape=array_ops.shape(filter),
         dilation_rate=rate,
         padding=padding,
         data_format=data_format,
         op=_native_depthwise)


 @tf_export("nn.depthwise_conv2d", v1=[])
 @dispatch.add_dispatch_support
 def depthwise_conv2d_v2(input,
                         filter,
                         strides,
                         padding,
                         data_format=None,
                         dilations=None,
                         name=None):
   """Depthwise 2-D convolution.

   Takes a 4D input tensor (in 'NHWC' or 'NCHW' data format) together with a
   filter tensor of shape
   `[filter_height, filter_width, in_channels, channel_multiplier]` that holds
   `in_channels` convolutional filters of depth 1.  `depthwise_conv2d`
   convolves every input channel with its own filter (turning 1 channel into
   `channel_multiplier` channels) and concatenates the per-channel results, so
   the output has `in_channels * channel_multiplier` channels.

   In detail, with the default NHWC format,

       output[b, i, j, k * channel_multiplier + q] =
           sum_{di, dj} filter[di, dj, k, q] *
                        input[b, strides[1] * i + dilations[0] * di,
                                 strides[2] * j + dilations[1] * dj, k]

   `strides[0]` and `strides[3]` must both be 1.  When the horizontal and
   vertical strides are equal (the most common case), use
   `strides = [1, stride, stride, 1]`.  If any entry of `dilations` exceeds 1
   the convolution is atrous (dilated), and then every entry of `strides` must
   be equal to 1.

   Usage Example:

   >>> x = np.array([
   ...     [1., 2.],
   ...     [3., 4.],
   ...     [5., 6.]
   ... ], dtype=np.float32).reshape((1, 3, 2, 1))
   >>> kernel = np.array([
   ...     [1., 2.],
   ...     [3., 4]
   ... ], dtype=np.float32).reshape((2, 1, 1, 2))
   >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
   ...                        padding='VALID').numpy()
     array([[[[10., 14.],
              [14., 20.]],
             [[18., 26.],
              [22., 32.]]]], dtype=float32)

   >>> tf.nn.depthwise_conv2d(x, kernel, strides=[1, 1, 1, 1],
   ...                        padding=[[0, 0], [1, 0], [1, 0], [0, 0]]).numpy()
     array([[[[ 0.,  0.],
              [ 3.,  4.],
              [ 6.,  8.]],
             [[ 0.,  0.],
              [10., 14.],
              [14., 20.]],
             [[ 0.,  0.],
              [18., 26.],
              [22., 32.]]]], dtype=float32)

   Args:
     input: 4-D with shape according to `data_format`.
     filter: 4-D with shape
       `[filter_height, filter_width, in_channels, channel_multiplier]`.
     strides: 1-D of size 4.  The stride of the sliding window for each
       dimension of `input`.
     padding: How the image is padded before the convolution is applied.
       Either the string `"SAME"` or `"VALID"`, naming the padding algorithm,
       or a list giving the explicit paddings at the start and end of each
       dimension. See
       [here](https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2)
       for more information. With explicit padding and data_format `"NHWC"`,
       use the form
       `[[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. With
       explicit padding and data_format `"NCHW"`, use the form
       `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
     dilations: 1-D of size 2. The dilation rate used to sample input values
       across the `height` and `width` dimensions in atrous convolution. If it
       is greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).

   Returns:
     A 4-D `Tensor` with shape according to `data_format`.  E.g., for
     "NHWC" format, shape is
     `[batch, out_height, out_width, in_channels * channel_multiplier].`
   """
   # Delegate to the v1 implementation; `dilations` is forwarded as its `rate`.
   return depthwise_conv2d(
       input=input,
       filter=filter,
       strides=strides,
       padding=padding,
       rate=dilations,
       name=name,
       data_format=data_format)

 # pylint: enable=redefined-builtin


 # pylint: disable=redefined-builtin,line-too-long
 @tf_export(v1=["nn.separable_conv2d"])
 @dispatch.add_dispatch_support
 def separable_conv2d(input,
                      depthwise_filter,
                      pointwise_filter,
                      strides,
                      padding,
                      rate=None,
                      name=None,
                      data_format=None,
                      dilations=None):
   """2-D convolution with separable filters.

   Runs a depthwise convolution, which acts on each channel separately, and
   then a pointwise convolution, which mixes channels.  The separability meant
   here is between dimensions `[1, 2]` and `3`; it is not spatial separability
   between dimensions `1` and `2`.

   In detail, with the default NHWC format,

       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
           depthwise_filter[di, dj, q, r] *
           pointwise_filter[0, 0, q * channel_multiplier + r, k]

   `strides` applies to the depthwise convolution only, because the pointwise
   convolution has implicit strides of `[1, 1, 1, 1]`.  `strides[0]` and
   `strides[3]` must both be 1.  When the horizontal and vertical strides are
   equal (the most common case), use `strides = [1, stride, stride, 1]`.
   If any entry of `rate` exceeds 1 the depthwise convolution is atrous
   (dilated), and then every entry of `strides` must be equal to 1.

   Args:
     input: 4-D `Tensor` with shape according to `data_format`.
     depthwise_filter: 4-D `Tensor` with shape
       `[filter_height, filter_width, in_channels, channel_multiplier]`.
       Contains `in_channels` convolutional filters of depth 1.
     pointwise_filter: 4-D `Tensor` with shape
       `[1, 1, channel_multiplier * in_channels, out_channels]`.  Pointwise
       filter to mix channels after `depthwise_filter` has convolved spatially.
     strides: 1-D of size 4.  The strides for the depthwise convolution for
       each dimension of `input`.
     padding: How the image is padded before the depthwise convolution is
       applied. Either the string `"SAME"` or `"VALID"`, naming the padding
       algorithm, or a Python list giving the explicit paddings at the start
       and end of each dimension. With explicit padding and data_format
       `"NHWC"`, use the form
       `[[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. With
       explicit padding and data_format `"NCHW"`, use the form
       `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
     rate: 1-D of size 2. The dilation rate used to sample input values across
       the `height` and `width` dimensions in atrous convolution. If it is
       greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
     dilations: Alias of rate.

   Returns:
     A 4-D `Tensor` with shape according to 'data_format'. For
       example, with data_format="NHWC", shape is [batch, out_height,
       out_width, out_channels].
   """
   rate = deprecated_argument_lookup("dilations", dilations, "rate", rate)
   with ops.name_scope(name, "separable_conv2d",
                       [input, depthwise_filter, pointwise_filter]) as name:
     input = ops.convert_to_tensor(input, name="tensor_in")
     depthwise_filter = ops.convert_to_tensor(
         depthwise_filter, name="depthwise_filter")
     pointwise_filter = ops.convert_to_tensor(
         pointwise_filter, name="pointwise_filter")

     # The pointwise filter must be rank 4, and its first two dimensions must
     # be compatible with 1.
     pointwise_dims = pointwise_filter.get_shape().with_rank(4).dims
     for leading_dim in pointwise_dims[:2]:
       leading_dim.assert_is_compatible_with(1)

     rate = [1, 1] if rate is None else rate

     # Expected layout of the ops in the graph:
     #   depthwise_conv2d  // Conv2D op corresponding to native depthwise conv.
     #   separable_conv2d  // Conv2D op corresponding to the pointwise conv.

     def _native_depthwise(converted_input, _, converted_padding):
       # Invoked by with_space_to_batch with the input and padding it supplies.
       return nn_ops.depthwise_conv2d_native(
           input=converted_input,
           filter=depthwise_filter,
           strides=strides,
           padding=converted_padding,
           data_format=data_format,
           name="depthwise")

     depthwise_output = nn_ops.with_space_to_batch(
         input=input,
         filter_shape=array_ops.shape(depthwise_filter),
         dilation_rate=rate,
         padding=padding,
         data_format=data_format,
         op=_native_depthwise)

     # The pointwise stage is a conv2d with unit strides and "VALID" padding.
     unit_strides = [1, 1, 1, 1]
     return nn_ops.conv2d(
         depthwise_output,
         pointwise_filter,
         unit_strides,
         padding="VALID",
         data_format=data_format,
         name=name)


 @tf_export("nn.separable_conv2d", v1=[])
 @dispatch.add_dispatch_support
 def separable_conv2d_v2(
     input,
     depthwise_filter,
     pointwise_filter,
     strides,
     padding,
     data_format=None,
     dilations=None,
     name=None,
 ):
   """2-D convolution with separable filters.

   Runs a depthwise convolution, which acts on each channel separately, and
   then a pointwise convolution, which mixes channels.  The separability meant
   here is between dimensions `[1, 2]` and `3`; it is not spatial separability
   between dimensions `1` and `2`.

   In detail, with the default NHWC format,

       output[b, i, j, k] = sum_{di, dj, q, r}
           input[b, strides[1] * i + di, strides[2] * j + dj, q] *
           depthwise_filter[di, dj, q, r] *
           pointwise_filter[0, 0, q * channel_multiplier + r, k]

   `strides` applies to the depthwise convolution only, because the pointwise
   convolution has implicit strides of `[1, 1, 1, 1]`.  `strides[0]` and
   `strides[3]` must both be 1.  When the horizontal and vertical strides are
   equal (the most common case), use `strides = [1, stride, stride, 1]`.
   If any entry of `dilations` exceeds 1 the depthwise convolution is atrous
   (dilated), and then every entry of `strides` must be equal to 1.

   Args:
     input: 4-D `Tensor` with shape according to `data_format`.
     depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
       in_channels, channel_multiplier]`. Contains `in_channels` convolutional
       filters of depth 1.
     pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
       in_channels, out_channels]`.  Pointwise filter to mix channels after
       `depthwise_filter` has convolved spatially.
     strides: 1-D of size 4.  The strides for the depthwise convolution for each
       dimension of `input`.
     padding: How the image is padded before the depthwise convolution is
       applied. Either the string `"SAME"` or `"VALID"`, naming the padding
       algorithm, or a Python list giving the explicit paddings at the start
       and end of each dimension. With explicit padding and data_format
       `"NHWC"`, use the form
       `[[0, 0], [pad_top, pad_bottom], [pad_left, pad_right], [0, 0]]`. With
       explicit padding and data_format `"NCHW"`, use the form
       `[[0, 0], [0, 0], [pad_top, pad_bottom], [pad_left, pad_right]]`.
     data_format: The data format for input. Either "NHWC" (default) or "NCHW".
     dilations: 1-D of size 2. The dilation rate used to sample input values
       across the `height` and `width` dimensions in atrous convolution. If it
       is greater than 1, then all values of strides must be 1.
     name: A name for this operation (optional).

   Returns:
     A 4-D `Tensor` with shape according to 'data_format'. For
       example, with data_format="NHWC", shape is [batch, out_height,
       out_width, out_channels].
   """
   # Delegate to the v1 implementation; `dilations` is forwarded as its `rate`.
   return separable_conv2d(
       input, depthwise_filter, pointwise_filter, strides, padding,
       rate=dilations,
       name=name,
       data_format=data_format)

 # pylint: enable=redefined-builtin,line-too-long


 @tf_export(v1=["nn.sufficient_statistics"])
 @dispatch.add_dispatch_support
 def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None,
                           keepdims=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.

   The statistics come from the one pass algorithm, applied to an input that
   may optionally be shifted first. See:
   https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

   For example:
   >>> t = [[1, 2, 3], [4, 5, 6]]
   >>> sufficient_statistics(t, [1])
   (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
   dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
   dtype=int32, numpy=array([14, 77], dtype=int32)>, None)
   >>> sufficient_statistics(t, [-1])
   (<tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(2,),
   dtype=int32, numpy=array([ 6, 15], dtype=int32)>, <tf.Tensor: shape=(2,),
   dtype=int32, numpy=array([14, 77], dtype=int32)>, None)

   Args:
     x: A `Tensor`.
     axes: Array of ints. Axes along which to compute mean and variance. As in
       Python, the axes can also be negative numbers. A negative axis is
       interpreted as counting from the end of the rank, i.e., axis +
       rank(values)-th dimension.
     shift: A `Tensor` holding the value by which the data is shifted for
       numerical stability, or `None` to perform no shift. A shift close to the
       true mean provides the most numerically stable results.
     keep_dims: produce statistics with the same dimensionality as the input.
     name: Name used to scope the operations that compute the sufficient stats.
     keepdims: Alias for keep_dims.

   Returns:
     Four `Tensor` objects of the same type as `x`:

     * the count (number of elements to average over).
     * the (possibly shifted) sum of the elements in the array.
     * the (possibly shifted) sum of squares of the elements in the array.
     * the shift by which the mean must be corrected or None if `shift` is None.
   """
   axes = list(set(axes))
   keep_dims = deprecated_argument_lookup(
       "keepdims", keepdims, "keep_dims", keep_dims)
   keep_dims = False if keep_dims is None else keep_dims
   with ops.name_scope(name, "sufficient_statistics", [x, shift]):
     x = ops.convert_to_tensor(x, name="x")
     static_shape = x.get_shape()
     needs_runtime_shape = static_shape.rank is None or any(
         static_shape.dims[d].value is None for d in axes)
     if needs_runtime_shape:
       # gather requires non-negative indices, so offset negative axes by the
       # rank.
       rank = array_ops.rank(x)
       positive_axes = [axis + rank if axis < 0 else axis for axis in axes]
       shape_in_x_dtype = math_ops.cast(array_ops.shape(x), x.dtype)
       reduced_dims = array_ops.gather(shape_in_x_dtype, positive_axes)
       counts = math_ops.reduce_prod(reduced_dims, name="count")
     else:
       # Every reduced dimension is statically known: fold the count into a
       # constant.
       static_count = math.prod(static_shape.dims[d].value for d in axes)
       counts = constant_op.constant(static_count, dtype=x.dtype)
     if shift is None:
       m_ss = x
       v_ss = math_ops.square(x)
     else:
       shift = ops.convert_to_tensor(shift, name="shift")
       m_ss = math_ops.subtract(x, shift)
       v_ss = math_ops.squared_difference(x, shift)
     m_ss = math_ops.reduce_sum(m_ss, axes, keepdims=keep_dims, name="mean_ss")
     v_ss = math_ops.reduce_sum(v_ss, axes, keepdims=keep_dims, name="var_ss")
   return counts, m_ss, v_ss, shift


 @tf_export("nn.sufficient_statistics", v1=[])
 @dispatch.add_dispatch_support
 def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.

   The statistics come from the one pass algorithm, applied to an input that
   may optionally be shifted first. See:
   https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data

   Args:
     x: A `Tensor`.
     axes: Array of ints. Axes along which to compute mean and variance.
     shift: A `Tensor` holding the value by which the data is shifted for
       numerical stability, or `None` to perform no shift. A shift close to the
       true mean provides the most numerically stable results.
     keepdims: produce statistics with the same dimensionality as the input.
     name: Name used to scope the operations that compute the sufficient stats.

   Returns:
     Four `Tensor` objects of the same type as `x`:

     * the count (number of elements to average over).
     * the (possibly shifted) sum of the elements in the array.
     * the (possibly shifted) sum of squares of the elements in the array.
     * the shift by which the mean must be corrected or None if `shift` is None.
   """
   # Delegate to the v1 implementation, which takes `keepdims` as `keep_dims`.
   return sufficient_statistics(
       x=x,
       axes=axes,
       shift=shift,
       keep_dims=keepdims,
       name=name)


 @tf_export("nn.normalize_moments")
 @dispatch.add_dispatch_support
 def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance based on the sufficient statistics.

   Args:
     counts: A `Tensor` containing the total count of the data (one value).
     mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
       shifted) sum of the elements to average over.
     variance_ss: A `Tensor` containing the variance sufficient statistics: the
       (possibly shifted) squared sum of the data to compute the variance over.
     shift: A `Tensor` containing the value by which the data is shifted for
       numerical stability, or `None` if no shift was performed.
     name: Name used to scope the operations that compute the moments.

   Returns:
     Two `Tensor` objects: `mean` and `variance`.
   """
   with ops.name_scope(name, "normalize", [counts, mean_ss, variance_ss, shift]):
     has_shift = shift is not None
     divisor = math_ops.reciprocal(counts, name="divisor")
     # Without a shift the scaled sum already is the mean, so it takes the
     # "mean" op name directly.
     shifted_mean = math_ops.multiply(
         mean_ss, divisor, name="shifted_mean" if has_shift else "mean")
     if has_shift:
       mean = math_ops.add(shifted_mean, shift, name="mean")
     else:
       mean = shifted_mean
     mean_of_squares = math_ops.multiply(variance_ss, divisor)
     squared_mean = math_ops.square(shifted_mean)
     variance = math_ops.subtract(
         mean_of_squares, squared_mean, name="variance")
   return (mean, variance)


 @tf_export(v1=["nn.moments"])
 @dispatch.add_dispatch_support
 def moments(
     x,
     axes,
     shift=None,  # pylint: disable=unused-argument
     name=None,
     keep_dims=None,
     keepdims=None):
   """Calculate the mean and variance of `x`.

   Both moments are obtained by aggregating the contents of `x` across `axes`.
   For a 1-D `x` with `axes = [0]` this is simply the mean and variance of a
   vector.

   Note: shift is currently not used; the true mean is computed and used.

   When using these moments for batch normalization (see
   `tf.nn.batch_normalization`):

    * for so-called "global normalization", used with convolutional filters with
      shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
    * for simple batch normalization pass `axes=[0]` (batch only).

   Args:
     x: A `Tensor`.
     axes: Array of ints.  Axes along which to compute mean and
       variance.
     shift: Not used in the current implementation
     name: Name used to scope the operations that compute the moments.
     keep_dims: produce moments with the same dimensionality as the input.
     keepdims: Alias to keep_dims.

   Returns:
     Two `Tensor` objects: `mean` and `variance`.
   """
   keep_dims = deprecated_argument_lookup(
       "keepdims", keepdims, "keep_dims", keep_dims)
   keep_dims = False if keep_dims is None else keep_dims
   with ops.name_scope(name, "moments", [x, axes]):
     # fp16 has too little dynamic range for collecting sufficient statistics,
     # so the work is done in 32-bit floats and the results are converted back
     # to fp16 at the end.
     input_is_fp16 = x.dtype == dtypes.float16
     y = math_ops.cast(x, dtypes.float32) if input_is_fp16 else x
     # The reduced dims are kept here so the mean broadcasts against `y`.
     mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # This is the sample variance, not the unbiased variance.
     # Note: stop_gradient does not change the gradient that gets
     #       backpropagated to the mean from the variance calculation,
     #       because that gradient is zero
     squared_deviation = math_ops.squared_difference(
         y, array_ops.stop_gradient(mean))
     variance = math_ops.reduce_mean(
         squared_deviation, axes, keepdims=True, name="variance")
     if not keep_dims:
       mean = array_ops.squeeze(mean, axes)
       variance = array_ops.squeeze(variance, axes)
     if input_is_fp16:
       mean = math_ops.cast(mean, dtypes.float16)
       variance = math_ops.cast(variance, dtypes.float16)
     return (mean, variance)


 @tf_export("nn.moments", v1=[])
 @dispatch.add_dispatch_support
 def moments_v2(
     x,
     axes,
     shift=None,
     keepdims=False,
     name=None):
   """Calculates the mean and variance of `x`.

   Both moments are obtained by aggregating the contents of `x` across `axes`.
   For a 1-D `x` with `axes = [0]` this is simply the mean and variance of a
   vector.

   Note: shift is currently not used; the true mean is computed and used.

   When using these moments for batch normalization (see
   `tf.nn.batch_normalization`):

    * for so-called "global normalization", used with convolutional filters with
      shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
    * for simple batch normalization pass `axes=[0]` (batch only).

   Args:
     x: A `Tensor`.
     axes: Array of ints.  Axes along which to compute mean and
       variance.
     shift: Not used in the current implementation.
     keepdims: produce moments with the same dimensionality as the input.
     name: Name used to scope the operations that compute the moments.

   Returns:
     Two `Tensor` objects: `mean` and `variance`.
   """
   # Delegate to the v1 implementation, which takes `keepdims` as `keep_dims`.
   return moments(
       x=x,
       axes=axes,
       shift=shift,
       name=name,
       keep_dims=keepdims)


 @tf_export(v1=["nn.weighted_moments"])
 @dispatch.add_dispatch_support
 def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None,
                      keepdims=None):
   """Returns the frequency-weighted mean and variance of `x`.

   Args:
     x: A tensor.
     axes: 1-d tensor of int32 values; these are the axes along which
       to compute mean and variance.
     frequency_weights: A tensor of positive weights which can be
       broadcast with x.
     name: Name used to scope the operation.
     keep_dims: Produce moments with the same dimensionality as the input.
     keepdims: Alias of keep_dims.

   Returns:
     Two tensors: `weighted_mean` and `weighted_variance`.
   """
   keep_dims = deprecated_argument_lookup(
       "keepdims", keepdims, "keep_dims", keep_dims)
   keep_dims = False if keep_dims is None else keep_dims
   with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
     x = ops.convert_to_tensor(x, name="x")
     frequency_weights = ops.convert_to_tensor(
         frequency_weights, name="frequency_weights")

     # Unlike moments(), this just uses a simpler two-pass method.

     # The precision comment in moments() applies here as well: float16 input
     # is processed as float32 and the results are cast back at the end.
     needs_cast = x.dtype == dtypes.float16
     if needs_cast:
       x = math_ops.cast(x, dtypes.float32)

     if frequency_weights.dtype != x.dtype:
       frequency_weights = math_ops.cast(frequency_weights, x.dtype)

     # All reductions below use keepdims=True whatever the argument says, so
     # that their results stay broadcast-compatible with the inputs.
     weighted_x = frequency_weights * x
     weighted_input_sum = math_ops.reduce_sum(
         weighted_x, axes, keepdims=True, name="weighted_input_sum")

     # The weights only need to be broadcast-compatible with x, not the same
     # shape.  Adding zeros shaped like x broadcasts them to one weight per
     # item, with the same shape as (frequency_weights * x), which avoids
     # reasoning through the broadcast logic to get a correct sum_of_weights.
     zeros_like_x = array_ops.zeros_like(x)
     broadcasted_weights = frequency_weights + zeros_like_x

     sum_of_weights = math_ops.reduce_sum(
         broadcasted_weights, axes, keepdims=True, name="sum_of_weights")

     weighted_mean = math_ops.div_no_nan(weighted_input_sum, sum_of_weights)

     # With the weighted mean in hand, compute the variance.
     weighted_sq_diff = frequency_weights * math_ops.squared_difference(
         x, weighted_mean)
     weighted_distsq = math_ops.reduce_sum(
         weighted_sq_diff, axes, keepdims=True, name="weighted_distsq")

     weighted_variance = math_ops.div_no_nan(weighted_distsq, sum_of_weights)

     if not keep_dims:
       weighted_mean = array_ops.squeeze(weighted_mean, axis=axes)
       weighted_variance = array_ops.squeeze(weighted_variance, axis=axes)

     if needs_cast:
       weighted_mean = math_ops.cast(weighted_mean, dtypes.float16)
       weighted_variance = math_ops.cast(weighted_variance, dtypes.float16)

     return weighted_mean, weighted_variance


 @tf_export("nn.weighted_moments", v1=[])
 @dispatch.add_dispatch_support
 def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None):
   """Returns the frequency-weighted mean and variance of `x`.

   Args:
     x: A tensor.
     axes: 1-d tensor of int32 values; these are the axes along which
       to compute mean and variance.
     frequency_weights: A tensor of positive weights which can be
       broadcast with x.
     keepdims: Produce moments with the same dimensionality as the input.
     name: Name used to scope the operation.

   Returns:
     Two tensors: `weighted_mean` and `weighted_variance`.
   """
   # Delegate to the v1 implementation, which takes `keepdims` as `keep_dims`.
   return weighted_moments(
       x=x, axes=axes, frequency_weights=frequency_weights,
       name=name, keep_dims=keepdims)


 @tf_export("nn.batch_normalization")
 @dispatch.add_dispatch_support
 def batch_normalization(x,
                         mean,
                         variance,
                         offset,
                         scale,
                         variance_epsilon,
                         name=None):
   r"""Batch normalization.

   Normalizes a tensor by `mean` and `variance`, and applies (optionally) a
   `scale` \\(\gamma\\) to it, as well as an `offset` \\(\beta\\):

   \\(\frac{\gamma(x-\mu)}{\sigma}+\beta\\)

   Concretely, the implementation computes
   `inv = rsqrt(variance + variance_epsilon)` (multiplied by `scale` when
   `scale` is not `None`) and returns `x * inv + (offset - mean * inv)`, where
   `offset` is treated as absent (only `-mean * inv` is added) when it is
   `None`. Both terms are cast to `x.dtype` before being combined.

   `mean`, `variance`, `offset` and `scale` are all expected to be of one of two
   shapes:

     * In all generality, they can have the same number of dimensions as the
       input `x`, with identical sizes as `x` for the dimensions that are not
       normalized over (the 'depth' dimension(s)), and dimension 1 for the
       others which are being normalized over.
       `mean` and `variance` in this case would typically be the outputs of
       `tf.nn.moments(..., keepdims=True)` during training, or running averages
       thereof during inference.
     * In the common case where the 'depth' dimension is the last dimension in
       the input tensor `x`, they may be one dimensional tensors of the same
       size as the 'depth' dimension.
       This is the case for example for the common `[batch, depth]` layout of
       fully-connected layers, and `[batch, height, width, depth]` for
       convolutions.
       `mean` and `variance` in this case would typically be the outputs of
       `tf.nn.moments(..., keepdims=False)` during training, or running averages
       thereof during inference.

   See equation 11 in Algorithm 2 of source:
   [Batch Normalization: Accelerating Deep Network Training by
   Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
   (http://arxiv.org/abs/1502.03167).

   Args:
     x: Input `Tensor` of arbitrary dimensionality.
     mean: A mean `Tensor`.
     variance: A variance `Tensor`.
     offset: An offset `Tensor`, often denoted \\(\beta\\) in equations, or
       None. If present, will be added to the normalized tensor.
     scale: A scale `Tensor`, often denoted \\(\gamma\\) in equations, or
       `None`. If present, the scale is applied to the normalized tensor.
     variance_epsilon: A small float number to avoid dividing by 0. It is added
       to `variance` before the reciprocal square root is taken.
     name: A name for this operation (optional). Defaults to "batchnorm".

   Returns:
     the normalized, scaled, offset tensor.

   References:
     Batch Normalization - Accelerating Deep Network Training by Reducing
     Internal Covariate Shift:
       [Ioffe et al., 2015](http://arxiv.org/abs/1502.03167)
       ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
   """
   with ops.name_scope(name, "batchnorm", [x, mean, variance, scale, offset]):
     # Reciprocal standard deviation; epsilon keeps the argument of rsqrt away
     # from zero.
     inv = math_ops.rsqrt(variance + variance_epsilon)
     if scale is not None:
       # Fold the scale into the multiplier so `x` is multiplied only once.
       inv *= scale
     # Note: tensorflow/contrib/quantize/python/fold_batch_norms.py depends on
     # the precise order of ops that are generated by the expression below.
     # Do not reorder or split this expression.
     return x * math_ops.cast(inv, x.dtype) + math_ops.cast(
         offset - mean * inv if offset is not None else -mean * inv, x.dtype)


 @tf_export(v1=["nn.fused_batch_norm"])
 @dispatch.add_dispatch_support
 def fused_batch_norm(
     x,
     scale,
     offset,  # pylint: disable=invalid-name
     mean=None,
     variance=None,
     epsilon=0.001,
     data_format="NHWC",
     is_training=True,
     name=None,
     exponential_avg_factor=1.0):
   r"""Batch normalization via the `fused_batch_norm_v3` op.

   See Source: [Batch Normalization: Accelerating Deep Network Training by
   Reducing Internal Covariate Shift; S. Ioffe, C. Szegedy]
   (http://arxiv.org/abs/1502.03167).

   Args:
     x: Input `Tensor` of 4 or 5 dimensions.
     scale: A `Tensor` of 1 dimension for scaling.
     offset: A `Tensor` of 1 dimension for bias.
     mean: A `Tensor` of 1 dimension for population mean. What must be passed
           depends on `is_training` and `exponential_avg_factor`:
           is_training==False (inference):
             Mean must be a `Tensor` of the same shape as scale containing the
             estimated population mean computed during training.
           is_training==True and exponential_avg_factor == 1.0:
             Mean must be None.
           is_training==True and exponential_avg_factor != 1.0:
             Mean must be a `Tensor` of the same shape as scale containing the
             exponential running mean.
     variance: A `Tensor` of 1 dimension for population variance. What must be
           passed depends on `is_training` and `exponential_avg_factor`:
           is_training==False (inference):
             Variance must be a `Tensor` of the same shape as scale containing
             the estimated population variance computed during training.
           is_training==True and exponential_avg_factor == 1.0:
             Variance must be None.
           is_training==True and exponential_avg_factor != 1.0:
             Variance must be a `Tensor` of the same shape as scale containing
             the exponential running variance.
     epsilon: A small float number added to the variance of x.
     data_format: The data format for x. Support "NHWC" (default) or "NCHW" for
                  4D tensors and "NDHWC" or "NCDHW" for 5D tensors.
     is_training: A bool value to specify if the operation is used for
                  training or inference.
     name: A name for this operation (optional).
     exponential_avg_factor: A float number (usually between 0 and 1) used
                             for controlling the decay of the running
                             population average of mean and variance.
                             If set to 1.0, the current batch average is
                             returned.

   Returns:
     y: A 4D or 5D Tensor for the normalized, scaled, offsetted x.
     running_mean: A 1D Tensor for the exponential running mean of x.
                   The output value is ((1 - exponential_avg_factor) * mean +
                   exponential_avg_factor * batch_mean), where batch_mean
                   is the mean of the current batch in x.
     running_var: A 1D Tensor for the exponential running variance.
                  The output value is ((1 - exponential_avg_factor) * variance +
                  exponential_avg_factor * batch_variance), where batch_variance
                  is the variance of the current batch in x.

   Raises:
     ValueError: If `mean` or `variance` is None while `is_training` is False
       or `exponential_avg_factor` is not 1.0.

   References:
     Batch Normalization - Accelerating Deep Network Training by Reducing
     Internal Covariate Shift:
       [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
       ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
   """
   # Caller-supplied statistics are mandatory for inference and whenever the
   # running averages are blended with the batch statistics.
   stats_required = not is_training or exponential_avg_factor != 1.0
   if stats_required and (mean is None or variance is None):
     raise ValueError("Both `mean` and `variance` must be a 1D tensor when "
                      "`is_training` is False or `exponential_avg_factor` != "
                      f"1.0. Received: `mean` {mean!r} and `variance` "
                      f"{variance!r}")

   x = ops.convert_to_tensor(x, name="input")
   scale = ops.convert_to_tensor(scale, name="scale")
   offset = ops.convert_to_tensor(offset, name="offset")

   # The op still needs tensor inputs for the statistics, so substitute empty
   # constants when none were given.
   mean = constant_op.constant([]) if mean is None else mean
   variance = constant_op.constant([]) if variance is None else variance

   # Only the first three of the op's six outputs are returned to the caller.
   (y, running_mean, running_var, _, _, _) = gen_nn_ops.fused_batch_norm_v3(
       x,
       scale,
       offset,
       mean,
       variance,
       epsilon=epsilon,
       exponential_avg_factor=exponential_avg_factor,
       data_format=data_format,
       is_training=is_training,
       name=name)
   return y, running_mean, running_var


 @tf_export(v1=["nn.batch_norm_with_global_normalization"])
 @dispatch.add_dispatch_support
 def batch_norm_with_global_normalization(t=None,
                                          m=None,
                                          v=None,
                                          beta=None,
                                          gamma=None,
                                          variance_epsilon=None,
                                          scale_after_normalization=None,
                                          name=None,
                                          input=None,  # pylint: disable=redefined-builtin
                                          mean=None,
                                          variance=None):
   """Batch normalization.

   This op is deprecated. See `tf.nn.batch_normalization`, which this function
   delegates to after resolving the aliased arguments.

   Args:
     t: A 4D input Tensor.
     m: A 1D mean Tensor with size matching the last dimension of t.
       This is the first output from tf.nn.moments,
       or a saved moving average thereof.
     v: A 1D variance Tensor with size matching the last dimension of t.
       This is the second output from tf.nn.moments,
       or a saved moving average thereof.
     beta: A 1D beta Tensor with size matching the last dimension of t.
       An offset to be added to the normalized tensor.
     gamma: A 1D gamma Tensor with size matching the last dimension of t.
       Only used as the scale when "scale_after_normalization" is true;
       otherwise no scale is applied.
     variance_epsilon: A small float number to avoid dividing by 0.
     scale_after_normalization: A bool indicating whether the resulted tensor
       needs to be multiplied with gamma.
     name: A name for this operation (optional).
     input: Alias for t.
     mean: Alias for m.
     variance: Alias for v.

   Returns:
      A batch-normalized `t`.

   References:
     Batch Normalization - Accelerating Deep Network Training by Reducing
     Internal Covariate Shift:
       [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
       ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
   """
   # Reconcile each new-style argument with its older single-letter alias.
   resolved_input = deprecated_argument_lookup("input", input, "t", t)
   resolved_mean = deprecated_argument_lookup("mean", mean, "m", m)
   resolved_variance = deprecated_argument_lookup("variance", variance, "v", v)

   # gamma is forwarded as the scale only when scaling was requested.
   effective_scale = gamma if scale_after_normalization else None
   return batch_normalization(resolved_input, resolved_mean, resolved_variance,
                              beta, effective_scale, variance_epsilon, name)


 # pylint: disable=redefined-builtin,line-too-long
 @tf_export("nn.batch_norm_with_global_normalization", v1=[])
 @dispatch.add_dispatch_support
 def batch_norm_with_global_normalization_v2(input,
                                             mean,
                                             variance,
                                             beta,
                                             gamma,
                                             variance_epsilon,
                                             scale_after_normalization,
                                             name=None):
   """Batch normalization.

   This op is deprecated. See `tf.nn.batch_normalization`.

   Forwards to `batch_norm_with_global_normalization`, passing `input`, `mean`
   and `variance` as that function's `t`, `m` and `v` arguments.

   Args:
     input: A 4D input Tensor.
     mean: A 1D mean Tensor with size matching the last dimension of `input`.
       This is the first output from tf.nn.moments,
       or a saved moving average thereof.
     variance: A 1D variance Tensor with size matching the last dimension of
       `input`. This is the second output from tf.nn.moments,
       or a saved moving average thereof.
     beta: A 1D beta Tensor with size matching the last dimension of `input`.
       An offset to be added to the normalized tensor.
     gamma: A 1D gamma Tensor with size matching the last dimension of `input`.
       If "scale_after_normalization" is true, this tensor will be multiplied
       with the normalized tensor.
     variance_epsilon: A small float number to avoid dividing by 0.
     scale_after_normalization: A bool indicating whether the resulted tensor
       needs to be multiplied with gamma.
     name: A name for this operation (optional).

   Returns:
      A batch-normalized `input`.

   References:
     Batch Normalization - Accelerating Deep Network Training by Reducing
     Internal Covariate Shift:
       [Ioffe et al., 2015](http://proceedings.mlr.press/v37/ioffe15.html)
       ([pdf](http://proceedings.mlr.press/v37/ioffe15.pdf))
   """
   normalized = batch_norm_with_global_normalization(
       t=input,
       m=mean,
       v=variance,
       beta=beta,
       gamma=gamma,
       variance_epsilon=variance_epsilon,
       scale_after_normalization=scale_after_normalization,
       name=name)
   return normalized

 # pylint: enable=redefined-builtin,line-too-long


 def _sum_rows(x):
   """Sums every row of the matrix `x`, returning one value per row."""
   # For a matrix this matches math_ops.reduce_sum(x, 1). It is written as a
   # matmul against a column of ones because, as noted by the original
   # authors, the resulting gradient is more efficient than reduce_sum's in
   # the current implementation, and nce_loss() is mostly used for training.
   num_cols = array_ops.shape(x)[1]
   column_of_ones = array_ops.ones(
       array_ops_stack.stack([num_cols, 1]), x.dtype)
   row_sums = math_ops.matmul(x, column_of_ones)
   # Flatten the [rows, 1] product into a vector.
   return array_ops.reshape(row_sums, [-1])


 def _compute_sampled_logits(weights,
                             biases,
                             labels,
                             inputs,
                             num_sampled,
                             num_classes,
                             num_true=1,
                             sampled_values=None,
                             subtract_log_q=True,
                             remove_accidental_hits=False,
                             partition_strategy="mod",
                             name=None,
                             seed=None):
   """Helper function for nce_loss and sampled_softmax_loss functions.

   Computes sampled output training logits and labels suitable for implementing
   e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
   sampled_softmax_loss).

   The true-class ids and the sampled ids are looked up together in `weights`
   and `biases`, then split back apart to form the true logits (row-wise dot
   product of `inputs` with each true class embedding, plus bias) and the
   sampled logits (`inputs` times the transposed sampled embeddings, plus
   bias). The two are concatenated with the true columns first.

   Note: In the case where num_true > 1, we assign to each target class
   the target probability 1 / num_true so that the target probabilities
   sum to 1 per-example.

   Args:
     weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
         objects whose concatenation along dimension 0 has shape
         `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
     biases: A `Tensor` of shape `[num_classes]`.  The (possibly-partitioned)
         class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.  Note that this format differs from
         the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
         Labels of any other dtype are cast to `int64`.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
     num_classes: An `int`. The number of possible classes.
     num_true: An `int`.  The number of target classes per training example.
     sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
         `sampled_expected_count`) returned by a `*_candidate_sampler` function.
         (if None, we default to `log_uniform_candidate_sampler`)
     subtract_log_q: A `bool`.  whether to subtract the log expected count of
         the labels in the sample to get the logits of the true labels.
         Default is True.  Turn off for Negative Sampling.
     remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
         where a sampled class equals one of the target classes.  Default is
         False.
     partition_strategy: A string specifying the partitioning strategy, relevant
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).
     seed: random seed for candidate sampling. Default to None, which doesn't set
         the op-level random seed for candidate sampling. Only used when
         `sampled_values` is None.

   Returns:
     out_logits: `Tensor` object with shape
         `[batch_size, num_true + num_sampled]`, for passing to either
         `nn.sigmoid_cross_entropy_with_logits` (NCE) or
         `nn.softmax_cross_entropy_with_logits` (sampled softmax).
     out_labels: A Tensor object with the same shape as `out_logits`.
   """

   # Normalize `weights` to a plain Python list: it is concatenated with the
   # other inputs for the name scope below and handed to embedding_lookup.
   if isinstance(weights, variables.PartitionedVariable):
     weights = list(weights)
   if not isinstance(weights, list):
     weights = [weights]

   with ops.name_scope(name, "compute_sampled_logits",
                       weights + [biases, inputs, labels]):
     if labels.dtype != dtypes.int64:
       labels = math_ops.cast(labels, dtypes.int64)
     labels_flat = array_ops.reshape(labels, [-1])

     # Sample the negative labels.
     #   sampled shape: [num_sampled] tensor
     #   true_expected_count shape = [batch_size, 1] tensor
     #   sampled_expected_count shape = [num_sampled] tensor
     # NOTE(review): for num_true > 1, true_expected_count is presumably
     # [batch_size, num_true] (it is subtracted from true_logits below) --
     # confirm against the candidate sampler documentation.
     if sampled_values is None:
       sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
           true_classes=labels,
           num_true=num_true,
           num_sampled=num_sampled,
           unique=True,
           range_max=num_classes,
           seed=seed)
     # The sampled values are wrapped in stop_gradient so that no gradient
     # flows back through them.
     # NOTE: pylint cannot tell that 'sampled_values' is a sequence
     # pylint: disable=unpacking-non-sequence
     sampled, true_expected_count, sampled_expected_count = (
         array_ops.stop_gradient(s) for s in sampled_values)
     # pylint: enable=unpacking-non-sequence
     sampled = math_ops.cast(sampled, dtypes.int64)

     # labels_flat is a [batch_size * num_true] tensor
     # sampled is a [num_sampled] int tensor
     # Concatenating them lets a single embedding lookup serve both groups; the
     # first len(labels_flat) rows of the result belong to the true classes.
     all_ids = array_ops.concat([labels_flat, sampled], 0)

     # Retrieve the true weights and the logits of the sampled weights.

     # weights shape is [num_classes, dim]
     all_w = embedding_ops.embedding_lookup(
         weights, all_ids, partition_strategy=partition_strategy)
     if all_w.dtype != inputs.dtype:
       all_w = math_ops.cast(all_w, inputs.dtype)

     # true_w shape is [batch_size * num_true, dim]
     true_w = array_ops.slice(all_w, [0, 0],
                              array_ops_stack.stack(
                                  [array_ops.shape(labels_flat)[0], -1]))

     # The remaining rows (after the true-class rows) are the sampled classes.
     sampled_w = array_ops.slice(
         all_w,
         array_ops_stack.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
     # inputs has shape [batch_size, dim]
     # sampled_w has shape [num_sampled, dim]
     # Apply X*W', which yields [batch_size, num_sampled]
     sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True)

     # Retrieve the true and sampled biases, compute the true logits, and
     # add the biases to the true and sampled logits.
     all_b = embedding_ops.embedding_lookup(
         biases, all_ids, partition_strategy=partition_strategy)
     if all_b.dtype != inputs.dtype:
       all_b = math_ops.cast(all_b, inputs.dtype)
     # true_b is a [batch_size * num_true] tensor
     # sampled_b is a [num_sampled] float tensor
     true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))
     sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

     # inputs shape is [batch_size, dim]
     # true_w shape is [batch_size * num_true, dim]
     # row_wise_dots is [batch_size, num_true, dim]
     # `dim` is kept as a length-1 tensor (slice 1:2) so it can be concatenated
     # into the target shapes below.
     dim = array_ops.shape(true_w)[1:2]
     new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
     row_wise_dots = math_ops.multiply(
         array_ops.expand_dims(inputs, 1),
         array_ops.reshape(true_w, new_true_w_shape))
     # We want the row-wise dot plus biases which yields a
     # [batch_size, num_true] tensor of true_logits.
     dots_as_matrix = array_ops.reshape(row_wise_dots,
                                        array_ops.concat([[-1], dim], 0))
     true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
     true_b = array_ops.reshape(true_b, [-1, num_true])
     true_logits += true_b
     sampled_logits += sampled_b

     if remove_accidental_hits:
       # compute_accidental_hits yields (indices, ids, weights); the weights are
       # scattered into a dense tensor shaped like sampled_logits and added to
       # it. NOTE(review): the weights are presumably large negative values
       # that mask the hit -- see the compute_accidental_hits documentation.
       acc_hits = candidate_sampling_ops.compute_accidental_hits(
           labels, sampled, num_true=num_true)
       acc_indices, acc_ids, acc_weights = acc_hits

       # This is how SparseToDense expects the indices.
       acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
       acc_ids_2d_int32 = array_ops.reshape(
           math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
       sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
                                         "sparse_indices")
       # Create sampled_logits_shape = [batch_size, num_sampled]
       sampled_logits_shape = array_ops.concat(
           [array_ops.shape(labels)[:1],
            array_ops.expand_dims(num_sampled, 0)], 0)
       if sampled_logits.dtype != acc_weights.dtype:
         acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
       sampled_logits += gen_sparse_ops.sparse_to_dense(
           sparse_indices,
           sampled_logits_shape,
           acc_weights,
           default_value=0.0,
           validate_indices=False)

     if subtract_log_q:
       # Subtract log of Q(l), prior probability that l appears in sampled.
       true_logits -= math_ops.log(true_expected_count)
       sampled_logits -= math_ops.log(sampled_expected_count)

     # Construct output logits and labels. The true labels/logits start at col 0.
     out_logits = array_ops.concat([true_logits, sampled_logits], 1)

     # true_logits is a float tensor, ones_like(true_logits) is a float
     # tensor of ones. We then divide by num_true to ensure the per-example
     # labels sum to 1.0, i.e. form a proper probability distribution.
     out_labels = array_ops.concat([
         array_ops.ones_like(true_logits) / num_true,
         array_ops.zeros_like(sampled_logits)
     ], 1)

     return out_logits, out_labels


 @tf_export("nn.nce_loss", v1=[])
 @dispatch.add_dispatch_support
 def nce_loss_v2(weights,
                 biases,
                 labels,
                 inputs,
                 num_sampled,
                 num_classes,
                 num_true=1,
                 sampled_values=None,
                 remove_accidental_hits=False,
                 name="nce_loss"):
   """Computes and returns the noise-contrastive estimation training loss.

   See [Noise-contrastive estimation: A new estimation principle for
   unnormalized statistical
   models](https://arxiv.org/abs/1806.03664).
   Also see our [Candidate Sampling Algorithms
   Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)

   Typically this loss is used while training, and the full sigmoid loss is
   computed for evaluation or inference, as in the following example:

   ```python
   if mode == "train":
     loss = tf.nn.nce_loss(
         weights=weights,
         biases=biases,
         labels=labels,
         inputs=inputs,
         ...)
   elif mode == "eval":
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
     loss = tf.nn.sigmoid_cross_entropy_with_logits(
         labels=labels_one_hot,
         logits=logits)
     loss = tf.reduce_sum(loss, axis=1)
   ```

   Note: the embedding lookup on `weights` and `biases` always uses the "div"
   partition strategy. Support for other partition strategies will be added
   later.

   Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
   so your labels must be sorted in order of decreasing frequency to achieve
   good results.  For more details, see
   `tf.random.log_uniform_candidate_sampler`.

   Note: In the case where `num_true` > 1, we assign to each target class
   the target probability 1 / `num_true` so that the target probabilities
   sum to 1 per-example.

   Note: It would be useful to allow a variable number of target classes per
   example.  We hope to provide this functionality in a future release.
   For now, if you have a variable number of target classes, you can pad them
   out to a constant number by either repeating them or by padding
   with an otherwise unused class.

   Args:
     weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
       objects whose concatenation along dimension 0 has shape [num_classes,
       dim].  The (possibly-partitioned) class embeddings.
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
       target classes.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
       the input network.
     num_sampled: An `int`.  The number of negative classes to randomly sample
       per batch. This single sample of negative classes is evaluated for each
       element in the batch.
     num_classes: An `int`. The number of possible classes.
     num_true: An `int`.  The number of target classes per training example.
     sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
       `sampled_expected_count`) returned by a `*_candidate_sampler` function.
       (if None, we default to `log_uniform_candidate_sampler`)
     remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
       where a sampled class equals one of the target classes.  If set to `True`,
       this is a "Sampled Logistic" loss instead of NCE, and we are learning to
       generate log-odds instead of log probabilities.  See our [Candidate
       Sampling Algorithms Reference]
       (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
       False.
     name: A name for the operation (optional).

   Returns:
     A `batch_size` 1-D tensor of per-example NCE losses.
   """
   # TODO(yuefengz): get partition_strategy from either variables or distribution
   # strategies.
   per_example_loss = nce_loss(
       weights,
       biases,
       labels,
       inputs,
       num_sampled,
       num_classes,
       num_true=num_true,
       sampled_values=sampled_values,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy="div",
       name=name)
   return per_example_loss


 @tf_export(v1=["nn.nce_loss"])
 @dispatch.add_dispatch_support
 def nce_loss(weights,
              biases,
              labels,
              inputs,
              num_sampled,
              num_classes,
              num_true=1,
              sampled_values=None,
              remove_accidental_hits=False,
              partition_strategy="mod",
              name="nce_loss"):
   """Computes and returns the noise-contrastive estimation training loss.

   Typically this loss is used while training, and the full sigmoid loss is
   computed for evaluation or inference. For the two losses to be consistent
   you must then set `partition_strategy="div"`, as in the following example:

   ```python
   if mode == "train":
     loss = tf.nn.nce_loss(
         weights=weights,
         biases=biases,
         labels=labels,
         inputs=inputs,
         ...,
         partition_strategy="div")
   elif mode == "eval":
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
     loss = tf.nn.sigmoid_cross_entropy_with_logits(
         labels=labels_one_hot,
         logits=logits)
     loss = tf.reduce_sum(loss, axis=1)
   ```

   Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
   so your labels must be sorted in order of decreasing frequency to achieve
   good results.  For more details, see
   `tf.random.log_uniform_candidate_sampler`.

   Note: In the case where `num_true` > 1, we assign to each target class
   the target probability 1 / `num_true` so that the target probabilities
   sum to 1 per-example.

   Note: It would be useful to allow a variable number of target classes per
   example.  We hope to provide this functionality in a future release.
   For now, if you have a variable number of target classes, you can pad them
   out to a constant number by either repeating them or by padding
   with an otherwise unused class.

   Args:
     weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
         objects whose concatenation along dimension 0 has shape
         [num_classes, dim].  The (possibly-partitioned) class embeddings.
     biases: A `Tensor` of shape `[num_classes]`.  The class biases.
     labels: A `Tensor` of type `int64` and shape `[batch_size,
         num_true]`. The target classes.
     inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
         activations of the input network.
     num_sampled: An `int`.  The number of negative classes to randomly sample
         per batch. This single sample of negative classes is evaluated for each
         element in the batch.
     num_classes: An `int`. The number of possible classes.
     num_true: An `int`.  The number of target classes per training example.
     sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
         `sampled_expected_count`) returned by a `*_candidate_sampler` function.
         (if None, we default to `log_uniform_candidate_sampler`)
     remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
         where a sampled class equals one of the target classes.  If set to
         `True`, this is a "Sampled Logistic" loss instead of NCE, and we are
         learning to generate log-odds instead of log probabilities. See
         our Candidate Sampling Algorithms Reference
         ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
         Default is False.
     partition_strategy: A string specifying the partitioning strategy, relevant
         if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
         Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
     name: A name for the operation (optional).

   Returns:
     A `batch_size` 1-D tensor of per-example NCE losses.

   References:
     Noise-contrastive estimation - A new estimation principle for unnormalized
     statistical models:
       [Gutmann et al., 2010](http://proceedings.mlr.press/v9/gutmann10a)
       ([pdf](http://proceedings.mlr.press/v9/gutmann10a/gutmann10a.pdf))
   """
   # Build logits and targets over the true classes plus the sampled negatives.
   # NCE always subtracts the log expected counts.
   sampled_logits, sampled_labels = _compute_sampled_logits(
       weights=weights,
       biases=biases,
       labels=labels,
       inputs=inputs,
       num_sampled=num_sampled,
       num_classes=num_classes,
       num_true=num_true,
       sampled_values=sampled_values,
       subtract_log_q=True,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy=partition_strategy,
       name=name)

   # One loss per (example, candidate) pair: the true-class columns come first,
   # followed by the sampled-class columns.
   per_candidate_losses = sigmoid_cross_entropy_with_logits(
       labels=sampled_labels, logits=sampled_logits, name="sampled_losses")

   # Collapse the candidate axis to get a single loss per example.
   return _sum_rows(per_candidate_losses)


 @tf_export("nn.sampled_softmax_loss", v1=[])
 @dispatch.add_dispatch_support
 def sampled_softmax_loss_v2(weights,
                             biases,
                             labels,
                             inputs,
                             num_sampled,
                             num_classes,
                             num_true=1,
                             sampled_values=None,
                             remove_accidental_hits=True,
                             seed=None,
                             name="sampled_softmax_loss"):
   """Computes and returns the sampled softmax training loss.

   Sampled softmax makes it cheaper to train a softmax classifier when the
   number of classes is very large.

   Use this operation for training only: its value is generally an
   underestimate of the full softmax loss.

   A typical setup trains with this loss and computes the full softmax loss
   for evaluation or inference, as in the following example:

   ```python
   if mode == "train":
     loss = tf.nn.sampled_softmax_loss(
         weights=weights,
         biases=biases,
         labels=labels,
         inputs=inputs,
         ...)
   elif mode == "eval":
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
     loss = tf.nn.softmax_cross_entropy_with_logits(
         labels=labels_one_hot,
         logits=logits)
   ```

   See our [Candidate Sampling Algorithms Reference]
   (https://www.tensorflow.org/extras/candidate_sampling.pdf)

   The math is described in Section 3 of
   [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
   ([pdf](http://arxiv.org/pdf/1412.2007.pdf)).

   Note: embedding lookups on `weights` and `biases` always use the "div"
   partition strategy. Support for other partition strategies will be added
   later.

   Args:
     weights: The (possibly-sharded) class embeddings. Either a `Tensor` of
       shape `[num_classes, dim]`, or a list of `Tensor` objects whose
       concatenation along dimension 0 has shape [num_classes, dim].
     biases: The class biases. A `Tensor` of shape `[num_classes]`.
     labels: The target classes. A `Tensor` of type `int64` and shape
       `[batch_size, num_true]`. Note that this format differs from the
       `labels` argument of `nn.softmax_cross_entropy_with_logits`.
     inputs: The forward activations of the input network. A `Tensor` of shape
       `[batch_size, dim]`.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
     num_classes: An `int`. The number of possible classes.
     num_true: An `int`.  The number of target classes per training example.
     sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
       `sampled_expected_count`) as returned by a `*_candidate_sampler`
       function. (If None, we default to `log_uniform_candidate_sampler`.)
     remove_accidental_hits: A `bool`.  Whether to remove "accidental hits",
       i.e. sampled classes that equal one of the target classes. Default is
       True.
     seed: Random seed for candidate sampling. Defaults to None, which doesn't
       set the op-level random seed for candidate sampling.
     name: A name for the operation (optional).

   Returns:
     A `batch_size` 1-D tensor of per-example sampled softmax losses.

   """
   # Delegate to the v1 implementation; the only difference is that the
   # partition strategy is fixed to "div" rather than being caller-selectable.
   return sampled_softmax_loss(
       weights=weights,
       biases=biases,
       labels=labels,
       inputs=inputs,
       num_sampled=num_sampled,
       num_classes=num_classes,
       num_true=num_true,
       sampled_values=sampled_values,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy="div",
       name=name,
       seed=seed)


 @tf_export(v1=["nn.sampled_softmax_loss"])
 @dispatch.add_dispatch_support
 def sampled_softmax_loss(weights,
                          biases,
                          labels,
                          inputs,
                          num_sampled,
                          num_classes,
                          num_true=1,
                          sampled_values=None,
                          remove_accidental_hits=True,
                          partition_strategy="mod",
                          name="sampled_softmax_loss",
                          seed=None):
   """Computes and returns the sampled softmax training loss.

   Sampled softmax makes it cheaper to train a softmax classifier when the
   number of classes is very large.

   Use this operation for training only: its value is generally an
   underestimate of the full softmax loss.

   A typical setup trains with this loss and computes the full softmax loss
   for evaluation or inference. For the two losses to be consistent in that
   setup you must pass `partition_strategy="div"`, as in the following example:

   ```python
   if mode == "train":
     loss = tf.nn.sampled_softmax_loss(
         weights=weights,
         biases=biases,
         labels=labels,
         inputs=inputs,
         ...,
         partition_strategy="div")
   elif mode == "eval":
     logits = tf.matmul(inputs, tf.transpose(weights))
     logits = tf.nn.bias_add(logits, biases)
     labels_one_hot = tf.one_hot(labels, n_classes)
     loss = tf.nn.softmax_cross_entropy_with_logits(
         labels=labels_one_hot,
         logits=logits)
   ```

   See our Candidate Sampling Algorithms Reference
   ([pdf](https://www.tensorflow.org/extras/candidate_sampling.pdf)).
   The math is described in Section 3 of (Jean et al., 2014).

   Args:
     weights: The (possibly-sharded) class embeddings. Either a `Tensor` of
         shape `[num_classes, dim]`, or a list of `Tensor` objects whose
         concatenation along dimension 0 has shape [num_classes, dim].
     biases: The class biases. A `Tensor` of shape `[num_classes]`.
     labels: The target classes. A `Tensor` of type `int64` and shape
         `[batch_size, num_true]`. Note that this format differs from the
         `labels` argument of `nn.softmax_cross_entropy_with_logits`.
     inputs: The forward activations of the input network. A `Tensor` of shape
         `[batch_size, dim]`.
     num_sampled: An `int`.  The number of classes to randomly sample per batch.
     num_classes: An `int`. The number of possible classes.
     num_true: An `int`.  The number of target classes per training example.
     sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
         `sampled_expected_count`) as returned by a `*_candidate_sampler`
         function. (If None, we default to `log_uniform_candidate_sampler`.)
     remove_accidental_hits: A `bool`.  Whether to remove "accidental hits",
         i.e. sampled classes that equal one of the target classes. Default is
         True.
     partition_strategy: A string naming the partitioning strategy; only
         relevant if `len(weights) > 1`. `"div"` and `"mod"` are currently
         supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more
         details.
     name: A name for the operation (optional).
     seed: Random seed for candidate sampling. Defaults to None, which doesn't
         set the op-level random seed for candidate sampling.

   Returns:
     A `batch_size` 1-D tensor of per-example sampled softmax losses.

   References:
     On Using Very Large Target Vocabulary for Neural Machine Translation:
       [Jean et al., 2014]
       (https://aclanthology.coli.uni-saarland.de/papers/P15-1001/p15-1001)
       ([pdf](http://aclweb.org/anthology/P15-1001))
   """
   # Build logits and matching labels over the true classes plus the sampled
   # candidates.
   sampled_logits, sampled_labels = _compute_sampled_logits(
       weights=weights,
       biases=biases,
       labels=labels,
       inputs=inputs,
       num_sampled=num_sampled,
       num_classes=num_classes,
       num_true=num_true,
       sampled_values=sampled_values,
       subtract_log_q=True,
       remove_accidental_hits=remove_accidental_hits,
       partition_strategy=partition_strategy,
       name=name,
       seed=seed)
   # Keep gradients from flowing back through the labels.
   constant_labels = array_ops.stop_gradient(
       sampled_labels, name="labels_stop_gradient")
   # The cross entropy is a [batch_size] tensor of per-example losses.
   return nn_ops.softmax_cross_entropy_with_logits_v2(
       labels=constant_labels, logits=sampled_logits)