| # Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================== |
| """Gradients for operators defined in nn_ops.py.""" |
| |
| import functools |
| import itertools |
| import operator |
| |
| from tensorflow.python.eager import backprop |
| from tensorflow.python.framework import dtypes |
| from tensorflow.python.framework import ops |
| from tensorflow.python.ops import array_ops |
| from tensorflow.python.ops import array_ops_stack |
| from tensorflow.python.ops import gen_nn_ops |
| from tensorflow.python.ops import math_ops |
| from tensorflow.python.ops import nn_ops |
| |
| |
| @ops.RegisterGradient("Conv2DBackpropInput") |
| def _Conv2DBackpropInputGrad(op, grad): |
| """The derivatives for deconvolution. |
| |
| Args: |
| op: the Deconvolution op. |
| grad: the tensor representing the gradient w.r.t. the output |
| |
| Returns: |
    the gradients w.r.t. the inputs `input_sizes` (always None), the filter,
    and `out_backprop`
| """ |
| # We call the gen_nn_ops backprop functions instead of nn_ops backprop |
| # functions for performance reasons in Eager mode. See _Conv2DGrad. |
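  # The first input of Conv2DBackpropInput is `input_sizes`, an integer shape
  # vector, so it gets no gradient (None in the list below).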
| return [ |
| None, |
| gen_nn_ops.conv2d_backprop_filter( |
| grad, |
| array_ops.shape(op.inputs[1]), |
| op.inputs[2], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"), |
| data_format=op.get_attr("data_format").decode()), |
| gen_nn_ops.conv2d( |
| grad, |
| op.inputs[1], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"), |
| data_format=op.get_attr("data_format").decode()) |
| ] |
| |
| |
| @ops.RegisterGradient("Conv2DBackpropFilter") |
| def _Conv2DBackpropFilterGrad(op, grad): |
| # We call the gen_nn_ops backprop functions instead of nn_ops backprop |
| # functions for performance reasons in Eager mode. See _Conv2DGrad. |
| return [ |
| gen_nn_ops.conv2d_backprop_input( |
| array_ops.shape(op.inputs[0]), |
| grad, |
| op.inputs[2], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"), |
| data_format=op.get_attr("data_format").decode()), None, |
| gen_nn_ops.conv2d( |
| op.inputs[0], |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| use_cudnn_on_gpu=op.get_attr("use_cudnn_on_gpu"), |
| data_format=op.get_attr("data_format").decode()) |
| ] |
| |
| |
| @ops.RegisterGradient("DepthwiseConv2dNativeBackpropInput") |
| def _DepthwiseConv2dNativeBackpropInputGrad(op, grad): |
| """The derivatives for deconvolution. |
| |
| Args: |
| op: the Deconvolution op. |
| grad: the tensor representing the gradient w.r.t. the output |
| |
| Returns: |
| the gradients w.r.t. the input and the filter |
| """ |
| return [ |
| None, |
| gen_nn_ops.depthwise_conv2d_native_backprop_filter( |
| grad, |
| array_ops.shape(op.inputs[1]), |
| op.inputs[2], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")), |
| gen_nn_ops.depthwise_conv2d_native( |
| grad, |
| op.inputs[1], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")) |
| ] |
| |
| |
| @ops.RegisterGradient("DepthwiseConv2dNativeBackpropFilter") |
| def _DepthwiseConv2dNativeBackpropFilterGrad(op, grad): |
| return [ |
| gen_nn_ops.depthwise_conv2d_native_backprop_input( |
| array_ops.shape(op.inputs[0]), |
| grad, |
| op.inputs[2], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")), None, |
| gen_nn_ops.depthwise_conv2d_native( |
| op.inputs[0], |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")) |
| ] |
| |
| |
| @ops.RegisterGradient("Conv3D") |
| def _Conv3DGrad(op, grad): |
| data_format = op.get_attr("data_format").decode() |
| shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]]) |
| return [ |
| nn_ops.conv3d_backprop_input_v2( |
| shape_0, |
| op.inputs[1], |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=data_format), |
| nn_ops.conv3d_backprop_filter_v2( |
| op.inputs[0], |
| shape_1, |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=data_format) |
| ] |
| |
| |
| @ops.RegisterGradient("Conv3DBackpropInputV2") |
| def _Conv3DBackpropInputGrad(op, grad): |
| data_format = op.get_attr("data_format").decode() |
| return [ |
| None, |
| nn_ops.conv3d_backprop_filter_v2( |
| grad, |
| array_ops.shape(op.inputs[1]), |
| op.inputs[2], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=data_format), |
| nn_ops.conv3d( |
| grad, |
| op.inputs[1], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=data_format) |
| ] |
| |
| |
| @ops.RegisterGradient("Conv3DBackpropFilterV2") |
| def _Conv3DBackpropFilterGrad(op, grad): |
| data_format = op.get_attr("data_format").decode() |
| return [ |
| nn_ops.conv3d_backprop_input_v2( |
| array_ops.shape(op.inputs[0]), |
| grad, |
| op.inputs[2], |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=data_format), None, |
| nn_ops.conv3d( |
| op.inputs[0], |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=data_format) |
| ] |
| |
| |
| @ops.RegisterGradient("AvgPool3D") |
| def _AvgPool3DGrad(op, grad): |
| return gen_nn_ops.avg_pool3d_grad( |
| array_ops.shape(op.inputs[0]), |
| grad, |
| ksize=op.get_attr("ksize"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format").decode()) |
| |
| |
| @ops.RegisterGradient("AvgPool3DGrad") |
| def _AvgPool3DGradGrad(op, grad): |
| return (array_ops.stop_gradient(op.inputs[0]), |
| gen_nn_ops.avg_pool3d( |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| op.get_attr("padding"), |
| data_format=op.get_attr("data_format").decode())) |
| |
| |
| @ops.RegisterGradient("MaxPool3D") |
| def _MaxPool3DGrad(op, grad): |
| return gen_nn_ops.max_pool3d_grad( |
| op.inputs[0], |
| op.outputs[0], |
| grad, |
| ksize=op.get_attr("ksize"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format").decode()) |
| |
| |
| @ops.RegisterGradient("MaxPool3DGrad") |
| def _MaxPool3DGradGrad(op, grad): |
| return (array_ops.zeros_like(op.inputs[0]), |
| array_ops.zeros_like(op.inputs[1]), |
| gen_nn_ops.max_pool3d_grad_grad( |
| op.inputs[0], |
| op.inputs[1], |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format").decode())) |
| |
| |
| @ops.RegisterGradient("MaxPool3DGradGrad") |
| def _MaxPool3DGradGradGrad(op, grad): |
| return (array_ops.zeros_like(op.inputs[0]), |
| array_ops.zeros_like(op.inputs[1]), |
| gen_nn_ops.max_pool3d_grad( |
| op.inputs[0], |
| op.inputs[1], |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format").decode())) |
| |
| |
| @ops.RegisterGradient("Softmax") |
| def _SoftmaxGrad(op, grad_softmax): |
| """The derivative of the softmax nonlinearity. |
| |
  We assume that probs is of shape [batch_size, dim].
  The Jacobian dsoftmax / dx is (diag(softmax) - softmax * softmax').
  This matrix is diagonal minus a rank-one matrix, so the vector-Jacobian
  product is easy to implement as follows:
| |
| grad_x = grad_softmax * softmax - sum(grad_softmax * softmax) * softmax |
| |
| Args: |
| op: the Softmax op. |
| grad_softmax: the tensor representing the gradient w.r.t. the softmax |
| output. |
| |
| Returns: |
| gradient w.r.t the input to the softmax |
| |
| """ |
| softmax = op.outputs[0] |
| sum_channels = math_ops.reduce_sum(grad_softmax * softmax, -1, keepdims=True) |
| return (grad_softmax - sum_channels) * softmax |
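
# A quick numerical sketch of the formula above (illustrative only, not part of
# the registered gradient): with softmax output s = [0.5, 0.5] and upstream
# gradient g = [1.0, 0.0], sum(g * s) = 0.5 and (g - 0.5) * s = [0.25, -0.25],
# which matches g applied to the Jacobian diag(s) - outer(s, s).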
| |
| |
| @ops.RegisterGradient("LogSoftmax") |
| def _LogSoftmaxGrad(op, grad): |
| """The gradient for log_softmax. |
| |
  log_softmax = input - log(sum(exp(input)))
  d(log_softmax_i) / d(input_j) = delta_ij - softmax(input)_j
| |
| Args: |
| op: The log softmax op. |
| grad: The tensor representing the gradient w.r.t. the output. |
| |
| Returns: |
| The gradients w.r.t. the input. |
| """ |
| softmax = math_ops.exp(op.outputs[0]) |
| return grad - math_ops.reduce_sum(grad, -1, keepdims=True) * softmax |
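
# Illustrative check (not part of the registered gradient): for logits [0., 0.]
# the softmax is [0.5, 0.5]; with upstream gradient [1., 0.] the formula above
# gives [1., 0.] - 1.0 * [0.5, 0.5] = [0.5, -0.5], which matches the Jacobian
# delta_ij - softmax_j applied to the upstream gradient.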
| |
| |
| @ops.RegisterGradient("BiasAdd") |
| def _BiasAddGrad(op, received_grad): |
| """Return the gradients for the 2 inputs of bias_op. |
| |
  The first input of op is the tensor t, and its gradient is
  just the gradient op received.

  The second input of op is the bias vector, which has one dimension (the
  feature dimension). Its gradient is the received gradient summed over all
  dimensions except the feature dimension.
| |
| Args: |
| op: The BiasOp for which we need to generate gradients. |
| received_grad: Tensor. The gradients passed to the BiasOp. |
| |
| Returns: |
| Two tensors, the first one for the "tensor" input of the BiasOp, |
| the second one for the "bias" input of the BiasOp. |
| """ |
| try: |
| data_format = op.get_attr("data_format") |
| except ValueError: |
| data_format = None |
| return (received_grad, |
| gen_nn_ops.bias_add_grad( |
| out_backprop=received_grad, data_format=data_format)) |
| |
| |
| @ops.RegisterGradient("BiasAddGrad") |
| def _BiasAddGradGrad(op, received_grad): |
| """Gradient for the BiasAddGrad op. |
| |
| Args: |
| op: BiasAddGrad op for which we are calculating gradients. |
| received_grad: The gradients passed to the BiasAddGrad op. |
| |
| Returns: |
| A single gradient Tensor for the input to BiasAddGrad (which |
| is the gradient of the bias term in BiasAdd) |
| """ |
| |
| try: |
| data_format = op.get_attr("data_format") |
| except ValueError: |
| data_format = None |
| |
| shape = array_ops.shape(op.inputs[0]) |
| bias_shape = array_ops.shape(received_grad) |
| |
| if data_format == b"NCHW": |
| expanded_shape = array_ops.concat([ |
| array_ops.ones_like(shape[:1]), bias_shape, |
| array_ops.ones_like(shape[2:]) |
| ], 0) |
| tile_mults = array_ops.concat([shape[:1], [1], shape[2:]], 0) |
| else: |
| expanded_shape = array_ops.concat( |
| [array_ops.ones_like(shape[:-1]), bias_shape], 0) |
| tile_mults = array_ops.concat([shape[:-1], [1]], 0) |
| |
| expanded_grad = array_ops.reshape(received_grad, expanded_shape) |
| return array_ops.tile(expanded_grad, tile_mults) |
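
# A shape sketch of the broadcast above (illustrative only): for an NHWC
# out_backprop of shape [N, H, W, C], BiasAddGrad reduces to a bias gradient of
# shape [C]; its own gradient therefore reshapes the incoming [C] tensor to
# [1, 1, 1, C] and tiles it by [N, H, W, 1] back to [N, H, W, C].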
| |
| |
| @ops.RegisterGradient("BiasAddV1") |
| def _BiasAddGradV1(unused_bias_op, received_grad): |
| """Return the gradients for the 2 inputs of bias_op. |
| |
  The first input of unused_bias_op is the tensor t, and its gradient is
  just the gradient the unused_bias_op received.

  The second input of unused_bias_op is the bias vector, which is 1-D and
  matches the last dimension of "received_grad". Its gradient is the received
  gradient summed over all dimensions except the last one.
| |
| Args: |
| unused_bias_op: The BiasOp for which we need to generate gradients. |
| received_grad: Tensor. The gradients passed to the BiasOp. |
| |
| Returns: |
| Two tensors, the first one for the "tensor" input of the BiasOp, |
| the second one for the "bias" input of the BiasOp. |
| """ |
| reduction_dim_tensor = math_ops.range(array_ops.rank(received_grad) - 1) |
| return (received_grad, math_ops.reduce_sum(received_grad, |
| reduction_dim_tensor)) |
| |
| |
| @ops.RegisterGradient("Relu") |
| def _ReluGrad(op, grad): |
| return gen_nn_ops.relu_grad(grad, op.outputs[0]) |
| |
| |
| @ops.RegisterGradient("EluGrad") |
| def _EluGradGrad(op, grad): |
| elu_x = op.inputs[1] |
| return (gen_nn_ops.elu_grad(grad, elu_x), |
| array_ops.where( |
| elu_x < 0, grad * op.inputs[0], array_ops.zeros_like(elu_x))) |
| |
| |
| @ops.RegisterGradient("SeluGrad") |
| def _SeluGradGrad(op, grad): |
| selu_x = op.inputs[1] |
| return (gen_nn_ops.selu_grad(grad, selu_x), |
| array_ops.where( |
| selu_x < 0., grad * op.inputs[0], array_ops.zeros_like(selu_x))) |
| |
| |
| @ops.RegisterGradient("Relu6") |
| def _Relu6Grad(op, grad): |
| return gen_nn_ops.relu6_grad(grad, op.outputs[0]) |
| |
| |
| @ops.RegisterGradient("Relu6Grad") |
| def _Relu6GradGrad(op, grad): |
| x = op.inputs[1] |
| return (gen_nn_ops.relu6_grad(grad, x), array_ops.zeros_like(x)) |
| |
| |
| @ops.RegisterGradient("LeakyRelu") |
| def _LeakyReluGrad(op, grad): |
| x = op.inputs[0] |
| alpha = op.get_attr("alpha") |
| return gen_nn_ops.leaky_relu_grad(grad, x, alpha=alpha) |
| |
| |
| @ops.RegisterGradient("LeakyReluGrad") |
| def _LeakyReluGradGrad(op, grad): |
| x = op.inputs[1] |
| alpha = op.get_attr("alpha") |
| return (gen_nn_ops.leaky_relu_grad(grad, x, |
| alpha=alpha), array_ops.zeros_like(x)) |
| |
| |
| @ops.RegisterGradient("Elu") |
| def _EluGrad(op, grad): |
| return gen_nn_ops.elu_grad(grad, op.outputs[0]) |
| |
| |
| @ops.RegisterGradient("Selu") |
| def _SeluGrad(op, grad): |
| return gen_nn_ops.selu_grad(grad, op.outputs[0]) |
| |
| |
| @ops.RegisterGradient("Softplus") |
| def _SoftplusGrad(op, grad): |
| return grad * math_ops.sigmoid(op.inputs[0]) |
| |
| |
| @ops.RegisterGradient("SoftplusGrad") |
| def _SoftplusGradGrad(op, grad): |
| # Let: |
| # y = tf.nn.softplus(x) |
| # dx = gen_nn_ops.softplus_grad(dy, x) = dy / (1 + exp(-x)) |
| # This op computes (ddy, d2x) from op.inputs == [dy, x] and grad == ddx. |
| dy, x = op.inputs |
| with ops.control_dependencies([grad]): |
| ddy = gen_nn_ops.softplus_grad(grad, x) |
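    # Note: sigmoid(x) * (1 - sigmoid(x))
    #     == 1 / ((1 + exp(-x)) * (1 + exp(x)))
    #     == 1 / (exp(-x) + 2 + exp(x)),
    # which is the denominator used for d2x below.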
| d2x = grad * dy / (math_ops.exp(-x) + 2.0 + math_ops.exp(x)) |
| return (ddy, d2x) |
| |
| |
| @ops.RegisterGradient("Softsign") |
| def _SoftsignGrad(op, grad): |
| return gen_nn_ops.softsign_grad(grad, op.inputs[0]) |
| |
| |
| @ops.RegisterGradient("ReluGrad") |
| def _ReluGradGrad(op, grad): |
| x = op.inputs[1] |
| return (gen_nn_ops.relu_grad(grad, x), array_ops.zeros_like(x)) |
| |
| |
| def _BroadcastMul(vec, mat): |
| """Multiply after broadcasting vec to match dimensions of mat. |
| |
| Args: |
| vec: A 1-D tensor of dimension [D0] |
| mat: A 2-D tensor of dimension [D0, D1] |
| |
| Returns: |
| A tensor of dimension [D0, D1], the result of vec * mat |
| """ |
| # Reshape vec to [D0, 1] |
| vec = array_ops.expand_dims(vec, -1) |
| return vec * mat |
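
# For example (illustrative only), _BroadcastMul([a, b], [[w, x], [y, z]])
# reshapes the vector to [[a], [b]] and returns [[a*w, a*x], [b*y, b*z]].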
| |
| |
| @ops.RegisterGradient("SoftmaxCrossEntropyWithLogits") |
| def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): |
| """Gradient function for SoftmaxCrossEntropyWithLogits.""" |
  # grad_loss is the backprop for cost, and we multiply it with the softmax
  # gradient (which is op.outputs[1]).
| # grad_grad is the backprop for softmax gradient. |
| # |
| # Second derivative is just softmax derivative w.r.t. logits. |
| softmax_grad = op.outputs[1] |
| grad = _BroadcastMul(grad_loss, softmax_grad) |
| |
| logits = op.inputs[0] |
| if (grad_grad is not None and |
| not getattr(grad_grad, "_is_zeros_tensor", False)): |
| softmax = nn_ops.softmax(logits) |
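
    # The expression below is the Jacobian-vector product of softmax at
    # `logits` applied to grad_grad:
    # (grad_grad - <grad_grad, softmax>) * softmax.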
| |
| grad += ((grad_grad - array_ops.squeeze( |
| math_ops.matmul( |
| array_ops.expand_dims(grad_grad, 1), |
| array_ops.expand_dims(softmax, 2)), |
| axis=1)) * softmax) |
| |
| return grad, _BroadcastMul(grad_loss, -nn_ops.log_softmax(logits)) # pylint: disable=invalid-unary-operand-type |
| |
| |
| @ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits") |
| def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): |
| """Gradient function for SparseSoftmaxCrossEntropyWithLogits.""" |
  # grad_loss is the backprop for cost, and we multiply it with the softmax
  # gradient (which is op.outputs[1]).
  # grad_grad is the backprop for softmax gradient.
  # There is no gradient for the labels.
| # |
| # Second derivative is just softmax derivative w.r.t. logits. |
| softmax_grad = op.outputs[1] |
| grad = _BroadcastMul(grad_loss, softmax_grad) |
| |
| logits = op.inputs[0] |
| if (grad_grad is not None and |
| not getattr(grad_grad, "_is_zeros_tensor", False)): |
| softmax = nn_ops.softmax(logits) |
| |
| grad += ((grad_grad - array_ops.squeeze( |
| math_ops.matmul( |
| array_ops.expand_dims(grad_grad, 1), |
| array_ops.expand_dims(softmax, 2)), |
| axis=1)) * softmax) |
| |
| return grad, None |
| |
| |
| @ops.RegisterGradient("Conv2D") |
| def _Conv2DGrad(op, grad): |
| """Gradient function for Conv2D.""" |
| dilations = op.get_attr("dilations") |
| strides = op.get_attr("strides") |
| padding = op.get_attr("padding") |
| explicit_paddings = op.get_attr("explicit_paddings") |
| use_cudnn_on_gpu = op.get_attr("use_cudnn_on_gpu") |
| data_format = op.get_attr("data_format") |
| shape_0, shape_1 = array_ops.shape_n([op.inputs[0], op.inputs[1]]) |
| |
| # We call the gen_nn_ops backprop functions instead of nn_ops backprop |
  # functions for performance reasons in Eager mode. gen_nn_ops functions take
  # an `explicit_paddings` parameter, but nn_ops functions do not. So if we were
| # to use the nn_ops functions, we would have to convert `padding` and |
| # `explicit_paddings` into a single `padding` parameter, increasing overhead |
| # in Eager mode. |
| return [ |
| gen_nn_ops.conv2d_backprop_input( |
| shape_0, |
| op.inputs[1], |
| grad, |
| dilations=dilations, |
| strides=strides, |
| padding=padding, |
| explicit_paddings=explicit_paddings, |
| use_cudnn_on_gpu=use_cudnn_on_gpu, |
| data_format=data_format), |
| gen_nn_ops.conv2d_backprop_filter( |
| op.inputs[0], |
| shape_1, |
| grad, |
| dilations=dilations, |
| strides=strides, |
| padding=padding, |
| explicit_paddings=explicit_paddings, |
| use_cudnn_on_gpu=use_cudnn_on_gpu, |
| data_format=data_format) |
| ] |
| |
| |
| @ops.RegisterGradient("DepthwiseConv2dNative") |
| def _DepthwiseConv2dNativeGrad(op, grad): |
| return [ |
| gen_nn_ops.depthwise_conv2d_native_backprop_input( |
| array_ops.shape(op.inputs[0]), |
| op.inputs[1], |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")), |
| gen_nn_ops.depthwise_conv2d_native_backprop_filter( |
| op.inputs[0], |
| array_ops.shape(op.inputs[1]), |
| grad, |
| dilations=op.get_attr("dilations"), |
| strides=op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")) |
| ] |
| |
| |
| @ops.RegisterGradient("Dilation2D") |
| def _Dilation2DGrad(op, grad): |
| return [ |
| nn_ops.dilation2d_backprop_input(op.inputs[0], op.inputs[1], grad, |
| op.get_attr("strides"), |
| op.get_attr("rates"), |
| op.get_attr("padding")), |
| nn_ops.dilation2d_backprop_filter(op.inputs[0], op.inputs[1], grad, |
| op.get_attr("strides"), |
| op.get_attr("rates"), |
| op.get_attr("padding")) |
| ] |
| |
| |
| @ops.RegisterGradient("LRN") |
| def _LRNGrad(op, grad): |
| depth_radius = op.get_attr("depth_radius") |
| bias = op.get_attr("bias") |
| alpha = op.get_attr("alpha") |
| beta = op.get_attr("beta") |
| return [ |
| gen_nn_ops.lrn_grad(grad, op.inputs[0], op.outputs[0], depth_radius, bias, |
| alpha, beta) |
| ] |
| |
| |
| @ops.RegisterGradient("AvgPool") |
| def _AvgPoolGrad(op, grad): |
| return gen_nn_ops.avg_pool_grad( |
| array_ops.shape(op.inputs[0]), |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| op.get_attr("padding"), |
| data_format=op.get_attr("data_format")) |
| |
| |
| @ops.RegisterGradient("AvgPoolGrad") |
| def _AvgPoolGradGrad(op, grad): |
| return (array_ops.stop_gradient(op.inputs[0]), |
| gen_nn_ops.avg_pool( |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| op.get_attr("padding"), |
| data_format=op.get_attr("data_format"))) |
| |
| |
| @ops.RegisterGradient("MaxPool") |
| def _MaxPoolGrad(op, grad): |
| return gen_nn_ops.max_pool_grad( |
| op.inputs[0], |
| op.outputs[0], |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| explicit_paddings=op.get_attr("explicit_paddings"), |
| data_format=op.get_attr("data_format")) |
| |
| |
| @ops.RegisterGradient("MaxPoolV2") |
| def _MaxPoolGradV2(op, grad): |
| ksize = op.inputs[1] |
| strides = op.inputs[2] |
| return gen_nn_ops.max_pool_grad_v2( |
| op.inputs[0], |
| op.outputs[0], |
| grad, |
| ksize, |
| strides, |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format")), None, None |
| |
| |
| @ops.RegisterGradient("MaxPoolWithArgmax") |
| def _MaxPoolGradWithArgmax(op, grad, unused_argmax_grad): |
| del unused_argmax_grad |
| return gen_nn_ops.max_pool_grad_with_argmax( |
| op.inputs[0], |
| grad, |
| op.outputs[1], |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| include_batch_in_index=op.get_attr("include_batch_in_index")) |
| |
| |
| @ops.RegisterGradient("MaxPoolGrad") |
| def _MaxPoolGradGrad(op, grad): |
| return (array_ops.zeros_like(op.inputs[0]), |
| array_ops.zeros_like(op.inputs[1]), |
| gen_nn_ops.max_pool_grad_grad( |
| op.inputs[0], |
| op.inputs[1], |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format"))) |
| |
| |
| @ops.RegisterGradient("MaxPoolGradV2") |
| def _MaxPoolGradGradV2(op, grad): |
| ksize = op.inputs[3] |
| strides = op.inputs[4] |
| return (array_ops.zeros_like(op.inputs[0]), |
| array_ops.zeros_like(op.inputs[1]), |
| gen_nn_ops.max_pool_grad_grad_v2( |
| op.inputs[0], |
| op.inputs[1], |
| grad, |
| ksize, |
| strides, |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format")), None, None) |
| |
| |
| @ops.RegisterGradient("MaxPoolGradGrad") |
| def _MaxPoolGradGradGrad(op, grad): |
| return (array_ops.zeros_like(op.inputs[0]), |
| array_ops.zeros_like(op.inputs[1]), |
| gen_nn_ops.max_pool_grad( |
| op.inputs[0], |
| op.inputs[1], |
| grad, |
| op.get_attr("ksize"), |
| op.get_attr("strides"), |
| padding=op.get_attr("padding"), |
| data_format=op.get_attr("data_format"))) |
| |
| |
| @ops.RegisterGradient("FractionalMaxPool") |
| def _FractionalMaxPoolGrad(op, grad_0, unused_grad_1, unused_grad_2): |
| """Returns gradient for FractionalMaxPool. |
| |
  Since FractionalMaxPool has three outputs, three gradients are passed in, one
  for each output. Only the first one is useful; the other two gradients are
  empty.
| |
| Args: |
| op: The FractionalMaxPoolOp. |
| grad_0: Gradient with respect to op.outputs[0] |
| unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty. |
| unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty. |
| |
| Returns: |
| Input backprop for FractionalMaxPool op. |
| """ |
| return gen_nn_ops.fractional_max_pool_grad( |
| op.inputs[0], op.outputs[0], grad_0, op.outputs[1], op.outputs[2], |
| op.get_attr("overlapping")) |
| |
| |
| @ops.RegisterGradient("FractionalAvgPool") |
| def _FractionalAvgPoolGrad(op, grad_0, unused_grad_1, unused_grad_2): |
| """Returns gradient for FractionalAvgPool. |
| |
  Since FractionalAvgPool has three outputs, three gradients are passed in, one
  for each output. Only the first one is useful; the other two gradients are
  empty.
| |
| Args: |
| op: The FractionalAvgPoolOp. |
| grad_0: Gradient with respect to op.outputs[0] |
| unused_grad_1: Gradient with respect to op.outputs[1]/row_seq. It is empty. |
| unused_grad_2: Gradient with respect to op.outputs[2]/col_seq. It is empty. |
| |
| Returns: |
| Input backprop for FractionalAvgPool op. |
| """ |
| return gen_nn_ops.fractional_avg_pool_grad(op.inputs[0].get_shape(), grad_0, |
| op.outputs[1], op.outputs[2], |
| op.get_attr("overlapping")) |
| |
| |
| @ops.RegisterGradient("BatchNormWithGlobalNormalization") |
| def _BatchNormWithGlobalNormalizationGrad(op, grad): |
| """Return the gradients for the 5 inputs of BatchNormWithGlobalNormalization. |
| |
| We do not backprop anything for the mean and var intentionally as they are |
| not being trained with backprop in the operation. |
| |
| Args: |
| op: The BatchNormOp for which we need to generate gradients. |
| grad: Tensor. The gradients passed to the BatchNormOp. |
| |
| Returns: |
| dx: Backprop for input, which is (grad * (g * rsqrt(v + epsilon))) |
| dm: Backprop for mean, which is |
        sum_over_rest(grad * g) * (-rsqrt(v + epsilon))
| dv: Backprop for variance, which is |
| sum_over_rest(grad * g * (x - m)) * (-1/2) * (v + epsilon) ^ (-3/2) |
| db: Backprop for beta, which is grad reduced in all except the |
| last dimension. |
| dg: Backprop for gamma, which is (grad * ((x - m) * rsqrt(v + epsilon))) |
| """ |
| dx, dm, dv, db, dg = gen_nn_ops.batch_norm_with_global_normalization_grad( |
| op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[4], grad, |
| op.get_attr("variance_epsilon"), op.get_attr("scale_after_normalization")) |
| return dx, dm, dv, db, dg |
| |
| |
| def _BaseFusedBatchNormGrad(op, version, *grad): |
| """Return the gradients for the 3 inputs of BatchNorm. |
| |
| Args: |
| op: The BatchNormOp for which we need to compute gradients. |
    version: Integer indicating which version of the fused batch norm
      gradient to use.
| *grad: An argument list for tensors of gradients wrt the outputs |
| with grad[0] as grad_y. |
| |
| Returns: |
| grad_x: gradient for x, which is scale * rsqrt(variance + epsilon) * |
| [grad_y - mean(grad_y) - (x - mean(x)) * |
| mean(grad_y * (x - mean(x))) / (variance + epsilon)] |
| in training mode; grad_y * scale * rsqrt(pop_variance + epsilon) |
| in freeze mode. |
| |
| grad_scale: gradient for scale, which is sum(grad_y * (x - mean(x)) * |
| rsqrt(variance + epsilon)) in training mode; |
| sum(grad_y * (x - pop_mean) * rsqrt(pop_variance + epsilon)) |
| in freeze mode. |
| |
| grad_offset: gradient for offset, which is sum(grad_y) in training mode; |
| sum(grad_y) in freeze mode. |
| """ |
| x = op.inputs[0] |
| grad_y = grad[0] |
| scale = op.inputs[1] |
| epsilon = op.get_attr("epsilon") |
| data_format = op.get_attr("data_format") |
| is_training = op.get_attr("is_training") |
| if version == 2: |
| grad_fun = gen_nn_ops.fused_batch_norm_grad_v3 |
| elif version == 1: |
| grad_fun = gen_nn_ops.fused_batch_norm_grad_v2 |
| else: |
| grad_fun = gen_nn_ops.fused_batch_norm_grad |
| if is_training: |
| args = { |
| "y_backprop": grad_y, |
| "x": x, |
| "scale": scale, |
| "reserve_space_1": op.outputs[3], |
| "reserve_space_2": op.outputs[4], |
| "epsilon": epsilon, |
| "data_format": data_format, |
| "is_training": is_training |
| } |
| if version == 2: |
| args["reserve_space_3"] = op.outputs[5] |
| dx, dscale, doffset, _, _ = grad_fun(**args) |
| else: |
| pop_mean = op.inputs[3] |
| pop_var = op.inputs[4] |
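    # In inference mode the inputs are transposed to a channels-last layout
    # (NHWC/NDHWC) before calling the grad kernel, and dx is transposed back
    # below.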
| if data_format == b"NCHW": |
| x = array_ops.transpose(x, [0, 2, 3, 1]) |
| grad_y = array_ops.transpose(grad_y, [0, 2, 3, 1]) |
| elif data_format == b"NCDHW": |
| x = array_ops.transpose(x, [0, 2, 3, 4, 1]) |
| grad_y = array_ops.transpose(grad_y, [0, 2, 3, 4, 1]) |
| target_data_format = ("NHWC" if data_format in (b"NCHW", |
| b"NHWC") else "NDHWC") |
| args = { |
| "y_backprop": grad_y, |
| "x": x, |
| "scale": scale, |
| "reserve_space_1": pop_mean, |
| "reserve_space_2": pop_var, |
| "epsilon": epsilon, |
| "data_format": target_data_format, |
| "is_training": is_training |
| } |
| if version == 2: |
| args["reserve_space_3"] = op.outputs[5] |
| dx, dscale, doffset, _, _ = grad_fun(**args) |
| if data_format == b"NCHW": |
| dx = array_ops.transpose(dx, [0, 3, 1, 2]) |
| elif data_format == b"NCDHW": |
| dx = array_ops.transpose(dx, [0, 4, 1, 2, 3]) |
| return dx, dscale, doffset, None, None |
| |
| |
| @ops.RegisterGradient("FusedBatchNorm") |
| def _FusedBatchNormGrad(op, *grad): |
| return _BaseFusedBatchNormGrad(op, 0, *grad) |
| |
| |
| @ops.RegisterGradient("FusedBatchNormV2") |
| def _FusedBatchNormV2Grad(op, *grad): |
| return _BaseFusedBatchNormGrad(op, 1, *grad) |
| |
| |
| @ops.RegisterGradient("FusedBatchNormV3") |
| def _FusedBatchNormV3Grad(op, *grad): |
| return _BaseFusedBatchNormGrad(op, 2, *grad) |
| |
| |
| def _BatchNormGrad(grad_y, |
| x, |
| scale, |
| pop_mean, |
| pop_var, |
| epsilon, |
| data_format, |
| is_training=True): |
| """Returns the gradients for the 3 inputs of BatchNorm. |
| |
| Args: |
| grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y. |
| x: A `Tensor` of 4 or 5 dimensions for x. |
| scale: A `Tensor` of 1 dimension for scaling. |
| pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when |
| is_training=False. |
| pop_var: A `Tensor` of 1 dimension for the population variance. Only used |
| when is_training=False. |
| epsilon: A small float number added to the variance of x. |
    data_format: The data format for input. One of b"NHWC", b"NCHW", b"NDHWC"
      or b"NCDHW".
| is_training: A bool value to indicate the operation is for training |
| (default) or inference. |
| |
| Returns: |
| A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient |
| for x, grad_scale the gradient for scale, and grad_offset the gradient |
| for offset. |
| """ |
| x_dtype = x.dtype.base_dtype |
| if x_dtype == dtypes.float16 or x_dtype == dtypes.bfloat16: |
    # float16 and bfloat16 math is too imprecise, so we do the batch norm
    # gradient computations in float32.
| x = math_ops.cast(x, dtypes.float32) |
| grad_y = math_ops.cast(grad_y, dtypes.float32) |
| if is_training: |
| if data_format == b"NHWC": |
| keepdims = False |
| reduce_axis = [0, 1, 2] |
| elif data_format == b"NDHWC": |
| keepdims = False |
| reduce_axis = [0, 1, 2, 3] |
| elif data_format == b"NCHW": |
| keepdims = True |
| reduce_axis = [0, 2, 3] |
| shape = [1, array_ops.size(scale), 1, 1] |
| scale = array_ops.reshape(scale, shape) |
| else: |
| keepdims = True |
| reduce_axis = [0, 2, 3, 4] |
| shape = [1, array_ops.size(scale), 1, 1, 1] |
| scale = array_ops.reshape(scale, shape) |
| mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims) |
| mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims) |
| var_x = math_ops.reduce_mean( |
| math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)), |
| reduce_axis, |
| keepdims=keepdims) |
| grad_y_offset = grad_y - mean_grad_y |
| x_offset = x - mean_x |
| mean = math_ops.reduce_mean( |
| grad_y * x_offset, axis=reduce_axis, keepdims=keepdims) |
| grad_x = scale * math_ops.rsqrt(var_x + epsilon) * ( |
| grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset) |
| grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum( |
| grad_y * x_offset, axis=reduce_axis, keepdims=keepdims) |
| if data_format == b"NCHW" or data_format == b"NCDHW": |
| grad_scale = array_ops.squeeze(grad_scale) |
| grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis) |
| return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset |
| else: |
| if data_format == b"NHWC": |
| reduce_axis = [0, 1, 2] |
| elif data_format == b"NDHWC": |
| reduce_axis = [0, 1, 2, 3] |
| elif data_format == b"NCHW": |
| reduce_axis = [0, 2, 3] |
| shape = [1, array_ops.size(pop_mean), 1, 1] |
| pop_mean = array_ops.reshape(pop_mean, shape) |
| pop_var = array_ops.reshape(pop_var, shape) |
| scale = array_ops.reshape(scale, shape) |
| else: |
| reduce_axis = [0, 2, 3, 4] |
| shape = [1, array_ops.size(pop_mean), 1, 1, 1] |
| pop_mean = array_ops.reshape(pop_mean, shape) |
| pop_var = array_ops.reshape(pop_var, shape) |
| scale = array_ops.reshape(scale, shape) |
| |
| grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis) |
| var_rsqrt = math_ops.rsqrt(pop_var + epsilon) |
| grad_scale = math_ops.reduce_sum( |
| grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis) |
| grad_x = grad_y * scale * var_rsqrt |
| return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset |
| |
| |
| @ops.RegisterGradient("FusedBatchNormGrad") |
| def _FusedBatchNormGradGrad(op, *grad): |
| """Returns the gradients for the 3 inputs of FusedBatchNormGrad. |
| |
| Args: |
| op: The FusedBatchNormGradOp for which we need to compute gradients. |
| *grad: An argument list for tensors of gradients wrt the outputs with |
| grad[0] as grad_grad_x, grad[1] as grad_grad_scale, grad[2] as |
| grad_grad_offset. |
| |
| Returns: |
| A tuple (grad_grad_y, grad_x, grad_scale, None, None), where grad_grad_y |
| is the gradient for grad_y, grad_x the gradient for x, grad_scale the |
| gradient for scale. |
| """ |
| data_format = op.get_attr("data_format") |
| epsilon = op.get_attr("epsilon") |
| is_training = op.get_attr("is_training") |
| grad_y = op.inputs[0] |
| x = op.inputs[1] |
| scale = op.inputs[2] |
| pop_mean = op.inputs[3] |
| pop_var = op.inputs[4] |
| grad_grad_x = grad[0] |
| grad_grad_scale = grad[1] |
| grad_grad_offset = grad[2] |
| with backprop.GradientTape() as tape: |
| tape.watch(grad_y) |
| tape.watch(x) |
| tape.watch(scale) |
| grad_x, grad_scale, grad_offset = _BatchNormGrad( |
| grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training) |
| grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset] |
| grad_grad_y, grad_x, grad_scale = tape.gradient( |
| [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial) |
| return grad_grad_y, grad_x, grad_scale, None, None |
| |
| |
| @ops.RegisterGradient("FusedBatchNormGradV2") |
| def _FusedBatchNormGradGradV2(op, *grad): |
| return _FusedBatchNormGradGrad(op, *grad) |
| |
| |
| @ops.RegisterGradient("FusedBatchNormGradV3") |
| def _FusedBatchNormGradGradV3(op, *grad): |
| grad_grad_y, grad_x, grad_scale, _, _ = _FusedBatchNormGradGrad(op, *grad) |
| return grad_grad_y, grad_x, grad_scale, None, None, None |
| |
| |
| @ops.RegisterGradient("L2Loss") |
| def _L2LossGrad(op, grad): |
| """Return the gradients for L2Loss. |
| |
| Args: |
| op: The L2LossOp for which we need to generate gradients. |
| grad: Tensor containing a single number. |
| |
| Returns: |
| The gradient, which is (x * grad). |
| """ |
| return op.inputs[0] * grad |
| |
| |
| @ops.RegisterGradient("TopK") |
| @ops.RegisterGradient("TopKV2") |
| def _TopKGrad(op, grad, _): |
| """Return the gradients for TopK. |
| |
| Args: |
| op: The TopKOp for which we need to generate gradients. |
| grad: Tensor. The gradients passed to the TopKOp. |
| |
| Returns: |
    A list of two tensors, the first being the gradient w.r.t. the input of
    TopK, and the second being the gradient w.r.t. the indices (all zero).
| """ |
| in_shape = array_ops.shape(op.inputs[0]) |
| ind_shape = array_ops.shape(op.outputs[1]) |
| |
  # int32 is not supported on GPU, hence the up-cast to int64.
| ind_lastdim = array_ops.gather( |
| math_ops.cast(ind_shape, dtypes.int64), |
| array_ops.size(ind_shape) - 1) |
| # Flatten indices to 2D. |
| ind_2d = array_ops.reshape( |
| op.outputs[1], array_ops_stack.stack([-1, ind_lastdim])) |
| |
| in_lastdim = array_ops.gather( |
| math_ops.cast(in_shape, dtypes.int64), |
| array_ops.size(in_shape) - 1) |
| outerdim = array_ops.shape(ind_2d)[0] |
| # Compute linear indices (flattened to 1D). |
| ind = array_ops.reshape( |
| ind_2d + math_ops.cast( |
| array_ops.expand_dims( |
| math_ops.range(0, |
| math_ops.cast(outerdim, dtypes.int64) * in_lastdim, |
| in_lastdim), -1), dtypes.int32), [-1]) |
| |
| # Substitute grad to appropriate locations and fill the rest with zeros, |
| # finally reshaping it to the original input shape. |
| return [ |
| array_ops.reshape( |
| array_ops.scatter_nd( |
| array_ops.expand_dims(ind, -1), array_ops.reshape(grad, [-1]), |
| [math_ops.reduce_prod(in_shape)]), in_shape), |
| array_ops.zeros([], dtype=dtypes.int32) |
| ] |
| |
| |
| @ops.RegisterGradient("ApproxTopK") |
| def _ApproxTopKGradient(op, grad, _): |
| """Return the gradients for ApproxTopK. |
| |
| Args: |
    op: The ApproxTopK op for which we need to generate gradients.
| grad: The gradients for backprop. |
| |
| Returns: |
| Scattered gradient based on the top-k indices. |
| """ |
  # The code below generates the correct index and value mapping for
  # scatter_nd to work properly.
  #
  # We use static evaluations as much as possible to reduce the runtime cost.
  # That said, we use operation.shape instead of array_ops.shape, and
  # functools.reduce(operator.mul, ...) instead of math_ops.reduce_prod.
| idx_shape = op.outputs[1].shape |
| lifted_idx_shape = idx_shape + [1] |
| flat_shape_len = functools.reduce(operator.mul, idx_shape) |
| rank = idx_shape.rank |
| reduction_dim = op.get_attr("reduction_dimension") |
| if reduction_dim < 0: |
| reduction_dim = rank + reduction_dim |
| |
| def GetLiftedIdx(d): |
| if d == reduction_dim: |
| return array_ops.reshape(op.outputs[1], lifted_idx_shape) |
| iota_len = idx_shape[d] |
| iota_shape = list(itertools.repeat(1, rank + 1)) |
| iota_shape[d] = iota_len |
| iota = array_ops.reshape(math_ops.range(iota_len), iota_shape) |
| return array_ops.broadcast_to(iota, lifted_idx_shape) |
| |
| lifted_idx = array_ops.concat( |
| list(GetLiftedIdx(d) for d in range(rank)), axis=rank) |
| flat_idx = array_ops.reshape(lifted_idx, [flat_shape_len, rank]) |
| flat_grad = array_ops.reshape(grad, [flat_shape_len]) |
| return array_ops.scatter_nd(flat_idx, flat_grad, op.inputs[0].shape) |
| |
| |
| @ops.RegisterGradient("NthElement") |
| def _NthElementGrad(op, grad): |
| """Return the gradients for NthElement. |
| |
| Args: |
| op: The NthElementOp for which we need to generate gradients. |
    grad: Tensor. The gradients passed to the NthElementOp.
| |
| Returns: |
| A list of two tensors, the first being the gradient w.r.t. the input, |
| the second being the gradient w.r.t. the N (None). |
| """ |
| input = op.inputs[0] # pylint: disable=redefined-builtin |
| output = op.outputs[0] |
| |
| # Compute the number of elements which equal to output in each reduction |
| # dimension. If there are multiple elements then the gradient will be |
| # divided between them. |
| indicators = math_ops.cast( |
| math_ops.equal(array_ops.expand_dims(output, -1), input), grad.dtype) |
| |
| grad = array_ops.expand_dims(grad, -1) |
| num_selected = array_ops.expand_dims(math_ops.reduce_sum(indicators, -1), -1) |
| |
| return [math_ops.divide(indicators, num_selected) * grad, None] |
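
# Illustrative example (not part of the gradient): if a row of the input is
# [3., 7., 7.] and the selected nth element is 7., then indicators is
# [0., 1., 1.], num_selected is 2, and the incoming gradient is split evenly
# between the two tied entries: [0., 0.5, 0.5] * grad.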
| |
| |
| def _MeanAggregator(inputs, segments): |
| """Replaces each segment with its mean along the last axis. |
| |
| Specifically, each value in the `inputs` tensor gets replaced by the mean |
| value computed from the values that belong to the same segment. |
| |
| Args: |
| inputs: A 2-tensor. Aggregation is done over dimension 1. |
    segments: A 2-tensor, same shape as `inputs`.
| |
| Returns: |
| The result, same shape and type as `inputs`. |
| """ |
| result = [] |
| for inputs_i, segments_i in zip( |
| array_ops.split(inputs, inputs.shape[0]), |
| array_ops.split(segments, segments.shape[0])): |
| # Note that we do not use tf.math.segment_mean, as it has no TPU support. |
| means_i = math_ops.unsorted_segment_mean( |
| inputs_i, segments_i, num_segments=math_ops.reduce_max(segments_i) + 1) |
| result.append( |
| array_ops.reshape(array_ops.gather(means_i, segments_i), [-1])) |
| return array_ops_stack.stack(result, axis=0) |
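
# Illustrative example (not part of the gradient): with an inputs row
# [1., 2., 10.] and a segments row [0, 0, 1], the two values in segment 0 are
# replaced by their mean, giving [1.5, 1.5, 10.].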
| |
| |
# We have to register the gradients for these ops so that TensorFlow will know
# how to differentiate them.
| @ops.RegisterGradient("IsotonicRegression") |
| def _IsotonicRegressionGrad(op, grad_output, grad_segments): |
| """Gradient for the isotonic regression function. |
| |
| Args: |
| op: The IsotonicRegression tensorflow op. |
| grad_output: Tensor of incoming gradients with respect to the output. |
| grad_segments: Tensor of incoming gradients with respect to the segments. |
| |
| Returns: |
| A tensor, same size as `grad_output` with the gradient with respect to |
| the input. |
| """ |
| del grad_segments # Discrete, non-differentiable. |
| segments = op.outputs[1] |
| return _MeanAggregator(grad_output, segments) |