| # Copyright 2022 The TensorFlow Authors. All Rights Reserved. |
| # |
| # Licensed under the Apache License, Version 2.0 (the "License"); |
| # you may not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an "AS IS" BASIS, |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # ============================================================================== |
| """Tests for approx_max_k and approx_min_k.""" |
| |
| import itertools |
| from absl.testing import parameterized |
| |
| import numpy as np |
| |
| from tensorflow.python.eager import backprop |
| from tensorflow.python.eager import test |
| from tensorflow.python.eager.def_function import function |
| from tensorflow.python.framework import dtypes |
| from tensorflow.python.framework import ops |
| from tensorflow.python.ops import array_ops |
| from tensorflow.python.ops import math_ops |
| from tensorflow.python.ops import nn_ops |
| from tensorflow.python.ops import variables |
| |
| |
class ApproxTopkTest(test.TestCase, parameterized.TestCase):
  """Recall and gradient tests for approx_max_k/approx_min_k on TPU."""

  def setUp(self):
    # Use super() so every base class in the MRO (including
    # parameterized.TestCase) is initialized, instead of calling
    # test.TestCase.setUp directly.
    super().setUp()
    # Seeded generator keeps the pseudo-random test data reproducible.
    self._rng = np.random.default_rng(42)

  def compute_recall(self, result_neighbors, ground_truth_neighbors):
    """Computes the recall of an approximate nearest neighbor search.

    Args:
      result_neighbors: int32 numpy array of the shape [num_queries,
        neighbors_per_query] where the values are the indices of the dataset.
      ground_truth_neighbors: int32 numpy array with shape [num_queries,
        ground_truth_neighbors_per_query] where the values are the indices of
        the dataset.

    Returns:
      The recall: the fraction of ground-truth neighbor entries recovered by
      the approximate search, averaged over all queries.
    """
    self.assertLen(result_neighbors.shape, 2)
    self.assertLen(ground_truth_neighbors.shape, 2)
    self.assertEqual(result_neighbors.shape[0], ground_truth_neighbors.shape[0])
    gt_sets = [set(np.asarray(x)) for x in ground_truth_neighbors]

    def hits_per_q(q, nn_per_q):
      # Count how many returned neighbors of query q are true neighbors.
      return sum(x.item() in gt_sets[q] for x in nn_per_q)

    hits = sum(
        hits_per_q(q, nn_per_q) for q, nn_per_q in enumerate(result_neighbors))
    return hits / ground_truth_neighbors.size

  @parameterized.parameters(
      itertools.product(
          [1, 10],  # k
          [100, 500],  # row_size
          [1, 10, 128],  # num_rows
          [True, False],  # aggregate_to_topk
      ))
  def test_non_fused_max_k(self, k, row_size, num_rows, aggregate_to_topk):
    # Each row is a random permutation of [0, row_size), so the exact top-k
    # entries are known for the recall computation below.
    row = np.arange(row_size, dtype=np.float32)
    db = np.stack([self._rng.permutation(row) for _ in range(num_rows)])

    @function(jit_compile=True)
    def ann(db, k):
      return nn_ops.approx_max_k(db, k, aggregate_to_topk=aggregate_to_topk)

    with ops.device('/device:TPU:0'):
      db_op = variables.Variable(db)
      # Index [1] selects the indices output; [0] would be the values.
      result = ann(db_op, k)[1]

    # Negate so that ascending argsort yields the max-k ground truth.
    gt = np.argsort(-db)[:, :k]
    ann_recall = self.compute_recall(result.numpy(), gt)
    self.assertGreaterEqual(ann_recall, 0.95)

  @parameterized.parameters(
      itertools.product(
          [1, 10],  # k
          [100, 500],  # row_size
          [1, 10, 128],  # num_rows
          [True, False],  # aggregate_to_topk
      ))
  def test_non_fused_min_k(self, k, row_size, num_rows, aggregate_to_topk):
    # Each row is a random permutation of [0, row_size), so the exact min-k
    # entries are known for the recall computation below.
    row = np.arange(row_size, dtype=np.float32)
    db = np.stack([self._rng.permutation(row) for _ in range(num_rows)])

    # No default for k, to stay consistent with test_non_fused_max_k (the
    # previous `k=10` default was never used).
    @function(jit_compile=True)
    def ann(db, k):
      return nn_ops.approx_min_k(db, k, aggregate_to_topk=aggregate_to_topk)

    with ops.device('/device:TPU:0'):
      db_op = variables.Variable(db)
      # Index [1] selects the indices output; [0] would be the values.
      result = ann(db_op, k)[1]

    gt = np.argsort(db)[:, :k]
    ann_recall = self.compute_recall(result.numpy(), gt)
    self.assertGreaterEqual(ann_recall, 0.95)

  @parameterized.parameters(
      itertools.product(
          [1, 10],  # k
          [100, 500],  # db_size
          [1, 10, 128],  # qy_size
          [2, 32],  # feature dim
      ))
  # MIPS = Maximal Inner Product Search
  def test_mips(self, k, db_size, qy_size, feature_dim):
    qy = self._rng.random([qy_size, feature_dim], dtype=np.float32)
    db = self._rng.random([db_size, feature_dim], dtype=np.float32)

    @function(jit_compile=True)
    def ann(qy, db, k):
      scores = math_ops.matmul(qy, db, transpose_b=True)
      return nn_ops.approx_max_k(scores, k)

    with ops.device('/device:TPU:0'):
      qy_op = variables.Variable(qy)
      db_op = variables.Variable(db)
      result = ann(qy_op, db_op, k)[1]
      # Negated scores: ascending argsort then yields the max-k ground truth.
      scores = -math_ops.matmul(qy_op, db_op, transpose_b=True)

    gt = np.argsort(scores.numpy())[:, :k]
    ann_recall = self.compute_recall(result.numpy(), gt)
    self.assertGreaterEqual(ann_recall, 0.95)

  @parameterized.parameters(
      itertools.product(
          [1, 10],  # k
          [100, 500],  # db_size
          [10, 128],  # qy_size
          [2, 8],  # feature dim
      ))
  # L2ANN = Approximate Nearest Neighbor search in the L2 metric space
  def test_l2ann(self, k, db_size, qy_size, feature_dim):
    qy = self._rng.random([qy_size, feature_dim], dtype=np.float32)
    db = self._rng.random([db_size, feature_dim], dtype=np.float32)
    # argmin_d(||d||^2/2 - q.d) == argmin_d ||q - d||^2, since ||q||^2 is
    # constant per query. db is float32, so the norm stays float32 too.
    db_half_norm_sq = np.linalg.norm(db, axis=1)**2 / 2

    @function(jit_compile=True)
    def ann(qy, db, db_half_norm_sq, k):
      scores = db_half_norm_sq - math_ops.matmul(qy, db, transpose_b=True)
      return nn_ops.approx_min_k(scores, k)

    with ops.device('/device:TPU:0'):
      qy_op = variables.Variable(qy)
      db_op = variables.Variable(db)
      db_half_norm_sq_op = variables.Variable(db_half_norm_sq)
      result = ann(qy_op, db_op, db_half_norm_sq_op, k)[1]
      # Recompute the exact scores eagerly for the ground truth.
      scores = db_half_norm_sq_op - math_ops.matmul(
          qy_op, db_op, transpose_b=True)

    gt = np.argsort(scores.numpy())[:, :k]
    ann_recall = self.compute_recall(result.numpy(), gt)
    self.assertGreaterEqual(ann_recall, 0.95)

  def test_highdim(self):
    # Reduce along an interior dimension (axis 2) of a rank-4 operand.
    db = self._rng.random([2, 10, 200, 3], dtype=np.float32)
    k = 5

    @function(jit_compile=True)
    def ann(db, k):
      return nn_ops.approx_min_k(db, k=k, reduction_dimension=2)

    with ops.device('/device:TPU:0'):
      db_op = variables.Variable(db)
      result = ann(db_op, k)[1]

    gt = np.argsort(db, axis=2)[:, :, :k, :]
    # Move the reduced axis last and flatten, so compute_recall sees the
    # usual [num_queries, k] layout.
    flat_idx = np.reshape(
        np.transpose(result.numpy(), [0, 1, 3, 2]), [2 * 10 * 3, k])
    flat_gt = np.reshape(np.transpose(gt, [0, 1, 3, 2]), [2 * 10 * 3, k])
    ann_recall = self.compute_recall(flat_idx, flat_gt)
    self.assertGreaterEqual(ann_recall, 0.95)

  @parameterized.parameters(
      itertools.product(
          [dtypes.bfloat16, dtypes.float16, dtypes.float32],
          [1, 10],  # k
          [100, 500],  # row_size
          [1, 10, 128],  # num_rows
      ))
  def test_gradients(self, dtype, k, row_size, num_rows):
    row = np.arange(row_size, dtype=np.float32)
    db = np.stack([self._rng.permutation(row) for _ in range(num_rows)])
    out_grads = self._rng.random([num_rows, k])

    @function(jit_compile=True)
    def ann_with_grads(db, out_grads):
      # The gradient of approx_max_k scatters out_grads back onto the
      # positions of the selected entries; everything else stays zero.
      with backprop.GradientTape() as tape:
        val, idx = nn_ops.approx_max_k(db, k)
      result_in_grads = tape.gradient(val, db, out_grads)
      # Build the expected gradient with scatter_nd: pair each selected
      # column index with its row index to form [num_rows * k, 2] coords.
      lifted_k_idx = array_ops.reshape(idx, [num_rows, k, 1])
      iota_idx = array_ops.broadcast_to(
          array_ops.reshape(math_ops.range(num_rows), [num_rows, 1, 1]),
          [num_rows, k, 1])
      lifted_idx = array_ops.concat([iota_idx, lifted_k_idx], axis=2)
      k_idx_s = array_ops.reshape(lifted_idx, [num_rows * k, 2])
      k_gra_s = array_ops.reshape(out_grads, [num_rows * k])
      expected_in_grads = array_ops.scatter_nd(k_idx_s, k_gra_s,
                                               [num_rows, row_size])
      return [expected_in_grads, result_in_grads]

    with ops.device('/device:TPU:0'):
      db_op = variables.Variable(db, dtype=dtype)
      out_grads_op = variables.Variable(out_grads, dtype=dtype)
      expected_in_grads, result_in_grads = ann_with_grads(db_op, out_grads_op)

    self.assertAllClose(expected_in_grads, result_in_grads)

  # Tests that multiple ops are supported and the comparison functions are
  # renamed properly to avoid conflict while using the MLIR bridge.
  def test_multiple_ops(self):
    k = 1

    row_size = 100
    num_rows = 10

    row = np.arange(row_size, dtype=np.float32)
    db1 = np.stack([self._rng.permutation(row) for _ in range(num_rows)])
    db2 = np.stack([self._rng.permutation(row) for _ in range(num_rows)])

    @function(jit_compile=True)
    def ann(db1, db2):
      result1 = nn_ops.approx_max_k(db1, k, aggregate_to_topk=True)
      result2 = nn_ops.approx_max_k(db2, k, aggregate_to_topk=True)
      return (result1, result2)

    with ops.device('/device:TPU:0'):
      db1_op = variables.Variable(db1)
      db2_op = variables.Variable(db2)
      result1, result2 = ann(db1_op, db2_op)

    gt = np.argsort(-db1)[:, :k]
    ann_recall = self.compute_recall(result1[1].numpy(), gt)
    self.assertGreaterEqual(ann_recall, 0.95)

    gt = np.argsort(-db2)[:, :k]
    ann_recall = self.compute_recall(result2[1].numpy(), gt)
    self.assertGreaterEqual(ann_recall, 0.95)
| |
# Run the test suite when this file is executed as a script.
if __name__ == '__main__':
  test.main()