Add group functions library

This patch adds a library and associated unit tests for grouping
functions emitted from the extract_functions script for more efficient
compilation.

Reviewers: svkeerthy, mtrofin

Reviewed By: mtrofin

Pull Request: https://github.com/google/ml-compiler-opt/pull/507
diff --git a/compiler_opt/tools/regalloc_trace/group_functions_lib.py b/compiler_opt/tools/regalloc_trace/group_functions_lib.py
new file mode 100644
index 0000000..29c6f78
--- /dev/null
+++ b/compiler_opt/tools/regalloc_trace/group_functions_lib.py
@@ -0,0 +1,139 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A library that contains utilities for grouping functions."""
+
+import dataclasses
+import os
+import math
+import subprocess
+import json
+
+from compiler_opt.rl import corpus
+
+
+@dataclasses.dataclass(frozen=True)
+class FunctionPathAndSize:  # Immutable (path, size) record for one function.
+  path: str  # Path to the function's '.bc' file.
+  size: int  # Size taken from the corpus module spec; used as the sort key.
+
+
+def _get_functions_chunked_by_command_line(
+    function_folder: str, delete_flags: tuple[str, ...] = ()
+) -> dict[tuple[str], list[str]]:
+  """Buckets single-function modules by their filtered command line.
+
+  Loads the corpus stored in function_folder, which is expected to hold one
+  function per module (the output of the extract_functions script), and
+  collects the modules whose command lines match once the flags listed in
+  delete_flags are dropped. Inside a bucket the functions are ordered from
+  the largest module size to the smallest; equally sized modules keep the
+  order in which the corpus lists them.
+
+  For example, a corpus with module 'a' (size 3) and module 'b' (size 5),
+  both using the command line ('-cc1',), produces
+  {('-cc1',): [<path of b.bc>, <path of a.bc>]}.
+
+  Args:
+    function_folder: Path to the corpus of individual functions. The '.bc'
+      paths in the result are built by joining names onto this folder.
+    delete_flags: Flags to remove from every command line. This should
+      cover all flags that make each command line unique.
+
+  Returns:
+    A dictionary keyed by command line (as a tuple) whose values are the
+    paths of the '.bc' files sharing that command line, largest first.
+  """
+  loaded_corpus = corpus.Corpus(
+      data_path=function_folder,
+      delete_flags=delete_flags,
+      construct_cmd_for_compilation=False)
+
+  # Pass 1: file every module under its command line. tuple(...) guarantees
+  # a hashable dictionary key.
+  buckets = {}
+  for spec in loaded_corpus.module_specs:
+    bitcode_path = os.path.join(function_folder, spec.name + '.bc')
+    entry = FunctionPathAndSize(bitcode_path, spec.size)
+    buckets.setdefault(tuple(spec.command_line), []).append(entry)
+
+  # Pass 2: order every bucket by descending size. list.sort is stable even
+  # with reverse=True, so ties stay in insertion order.
+  for entries in buckets.values():
+    entries.sort(key=lambda entry: entry.size, reverse=True)
+
+  # Pass 3: callers only need the paths, so drop the sizes.
+  paths_by_command_line = {
+      command_line: [entry.path for entry in entries]
+      for command_line, entries in buckets.items()
+  }
+  return paths_by_command_line
+
+
+def _partition_functions(
+    functions_per_command_line: dict[tuple[str], list[str]],
+    max_functions_per_chunk: int) -> dict[tuple[str], list[list[str]]]:
+  """Splits each command line's functions into size-limited chunks.
+
+  A list of n paths is dealt round-robin into ceil(n / limit) chunks, where
+  limit is max_functions_per_chunk. Raises ValueError when the limit is below
+  one and RuntimeError when a command line has no functions.
+  """
+  if max_functions_per_chunk < 1:
+    raise ValueError('max_functions_per_chunk must be at least one.')
+  corpus_chunks = {}
+  for command_line, functions in functions_per_command_line.items():
+    chunk_count = math.ceil(len(functions) / max_functions_per_chunk)
+    if chunk_count == 0:
+      raise RuntimeError('Expected chunks_for_command_line to be greater than '
+                         f'zero, actually got {chunk_count}.')
+    corpus_chunks[command_line] = [
+        functions[first::chunk_count] for first in range(chunk_count)]
+  return corpus_chunks
+
+
+def get_chunks(
+    function_folder: str, delete_flags: tuple[str, ...],
+    max_functions_per_chunk: int) -> dict[tuple[str], list[list[str]]]:
+  """Groups a function corpus by command line, then splits it into chunks."""
+  # Each command-line group is split separately by _partition_functions.
+  by_command_line = _get_functions_chunked_by_command_line(
+      function_folder, delete_flags)
+  return _partition_functions(by_command_line, max_functions_per_chunk)
+
+
+def combine_chunks(function_chunks: dict[tuple[str], list[list[str]]],
+                   llvm_link_path: str, output_folder: str):
+  """Links each chunk into one module and writes a corpus describing them.
+
+  Chunk number i yields i.bc (linked by llvm_link_path; a failed link raises
+  subprocess.CalledProcessError) and i.cmd (its NUL-separated command line) in
+  output_folder; corpus_description.json lists exactly the modules written.
+
+  Args:
+    function_chunks: Maps each command line to its chunks of bitcode paths.
+    llvm_link_path: Path to the llvm-link binary to run for every chunk.
+    output_folder: Directory that receives all of the generated files.
+  """
+  jobs = [(command_line, chunk) for command_line in function_chunks
+          for chunk in function_chunks[command_line]]
+  for index, (command_line, chunk) in enumerate(jobs):
+    prefix = os.path.join(output_folder, str(index))
+    subprocess.run([llvm_link_path, '-o', f'{prefix}.bc', *chunk],
+                   capture_output=True, check=True)
+    with open(f'{prefix}.cmd', 'w', encoding='utf-8') as cmd_handle:
+      cmd_handle.write('\0'.join(command_line))
+  description_path = os.path.join(output_folder, 'corpus_description.json')
+  with open(description_path, 'w', encoding='utf-8') as description_handle:
+    modules = [str(index) for index in range(len(jobs))]
+    json.dump({'has_thinlto': False, 'modules': modules}, description_handle)
diff --git a/compiler_opt/tools/regalloc_trace/group_functions_lib_test.py b/compiler_opt/tools/regalloc_trace/group_functions_lib_test.py
new file mode 100644
index 0000000..e420f07
--- /dev/null
+++ b/compiler_opt/tools/regalloc_trace/group_functions_lib_test.py
@@ -0,0 +1,127 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for group_functions_lib."""
+
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools.regalloc_trace import group_functions_lib
+from compiler_opt.rl import corpus
+from compiler_opt.testing import corpus_test_utils
+
+
+class GroupFunctionsTest(absltest.TestCase):
+  """Tests for chunking a function corpus and for linking the chunks."""
+
+  def test_get_chunks_one_command_line(self):
+    corpus_folder = self.create_tempdir()
+    corpus.create_corpus_for_testing(
+        corpus_folder.full_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=3, command_line=('-cc1',))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_folder.full_path, (),
+                                                   2)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[
+                os.path.join(corpus_folder.full_path, 'module1.bc'),
+                os.path.join(corpus_folder.full_path, 'module2.bc')
+            ]]
+        })
+
+  def test_get_chunks_two_command_lines(self):
+    corpus_folder = self.create_tempdir()
+    corpus.create_corpus_for_testing(
+        corpus_folder.full_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(
+                name='module2', size=5, command_line=('-cc1', '-O3'))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_folder.full_path, (),
+                                                   2)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[os.path.join(corpus_folder.full_path, 'module1.bc')]],
+            ('-cc1', '-O3'):
+                [[os.path.join(corpus_folder.full_path, 'module2.bc')]]
+        })
+
+  def test_get_chunks_multiple_chunks(self):
+    corpus_folder = self.create_tempdir()
+    corpus.create_corpus_for_testing(
+        corpus_folder.full_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=3, command_line=('-cc1',))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_folder.full_path, (),
+                                                   1)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[os.path.join(corpus_folder.full_path, 'module1.bc')],
+                        [os.path.join(corpus_folder.full_path, 'module2.bc')]]
+        })
+
+  def test_get_chunks_multiple_uneven_chunks(self):
+    corpus_folder = self.create_tempdir()
+    corpus.create_corpus_for_testing(
+        corpus_folder.full_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=4, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module3', size=3, command_line=('-cc1',)),
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_folder.full_path, (),
+                                                   2)
+    # Largest-first round-robin over two chunks: sizes [5, 3] and [4].
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[
+                os.path.join(corpus_folder.full_path, 'module1.bc'),
+                os.path.join(corpus_folder.full_path, 'module3.bc')
+            ], [os.path.join(corpus_folder.full_path, 'module2.bc')]]
+        })
+
+  def test_combine_chunks(self):
+    corpus_folder = self.create_tempdir()
+    corpus.create_corpus_for_testing(
+        corpus_folder.full_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=3, command_line=('-cc1',))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_folder.full_path, (),
+                                                   2)
+    fake_llvm_link_binary = self.create_tempfile('fake_llvm_link')
+    fake_llvm_link_invocations = self.create_tempfile(
+        'fake_llvm_link_invocations')
+    corpus_test_utils.create_test_binary(fake_llvm_link_binary.full_path,
+                                         fake_llvm_link_invocations.full_path,
+                                         ['touch $2'])
+    # combine_chunks passes the output path as the second argument ($2).
+    output_folder = self.create_tempdir()
+    group_functions_lib.combine_chunks(corpus_chunks,
+                                       fake_llvm_link_binary.full_path,
+                                       output_folder.full_path)
+    # Exact comparison, so a missing output file fails the test.
+    self.assertCountEqual(os.listdir(output_folder.full_path),
+                          ['corpus_description.json', '0.bc', '0.cmd'])
+
+
+if __name__ == '__main__':
+  absltest.main()  # Entry point when this file is executed directly.