Add group functions library
This patch adds a library and associated unit tests for grouping
functions emitted from the extract_functions script for more efficient
compilation.
Reviewers: svkeerthy, mtrofin
Reviewed By: mtrofin
Pull Request: https://github.com/google/ml-compiler-opt/pull/507
diff --git a/compiler_opt/tools/regalloc_trace/group_functions_lib.py b/compiler_opt/tools/regalloc_trace/group_functions_lib.py
new file mode 100644
index 0000000..29c6f78
--- /dev/null
+++ b/compiler_opt/tools/regalloc_trace/group_functions_lib.py
@@ -0,0 +1,139 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A library that contains utilities for grouping functions."""
+
+import dataclasses
+import os
+import math
+import subprocess
+import json
+
+from compiler_opt.rl import corpus
+
+
+@dataclasses.dataclass(frozen=True)
+class FunctionPathAndSize:
+ path: str  # Path to the function's bitcode (.bc) file.
+ size: int  # Size of the module (ModuleSpec.size); used as the sort key.
+
+
+def _get_functions_chunked_by_command_line(
+    function_folder: str, delete_flags: tuple[str, ...] = ()
+) -> dict[tuple[str, ...], list[str]]:
+  """Groups functions by their command line.
+
+  This function takes in a path to a corpus containing modules that all contain
+  single functions (from the extract_functions script). It then arranges these
+  functions by their command line, ensuring all functions with the same
+  command line (minus the flags in delete_flags) are in the same group.
+
+  Within a group the functions are ordered by decreasing module size; ties
+  keep the order in which the corpus lists them.
+
+  Args:
+    function_folder: The path to the corpus containing the individual functions
+      to process.
+    delete_flags: The flags to delete from each of the command lines. This
+      should contain all flags that make each command line unique.
+
+  Returns:
+    A dictionary mapping each command line, stored as a tuple, to the list of
+    bitcode paths (<function_folder>/<module name>.bc) of the functions that
+    share it.
+  """
+  function_corpus = corpus.Corpus(
+      data_path=function_folder,
+      delete_flags=delete_flags,
+      construct_cmd_for_compilation=False)
+
+  # First pass: bucket every module under its command line. The command line
+  # is converted to a tuple so that it is hashable and can act as the key, and
+  # the size travels with the path so each bucket can be ordered afterwards.
+  grouped_functions = {}
+  for module_spec in function_corpus.module_specs:
+    command_line = tuple(module_spec.command_line)
+    bitcode_path = os.path.join(function_folder, module_spec.name + '.bc')
+    grouped_functions.setdefault(command_line, []).append(
+        FunctionPathAndSize(bitcode_path, module_spec.size))
+
+  # Second pass: order each bucket from largest to smallest and drop the sizes.
+  # sorted() is stable, which is what preserves corpus order for equal sizes.
+  sorted_paths_per_command_line = {}
+  for command_line, functions in grouped_functions.items():
+    largest_first = sorted(
+        functions, key=lambda function: function.size, reverse=True)
+    sorted_paths_per_command_line[command_line] = [
+        function.path for function in largest_first
+    ]
+
+  return sorted_paths_per_command_line
+
+
+def _partition_functions(
+    functions_per_command_line: dict[tuple[str, ...], list[str]],
+    max_functions_per_chunk: int) -> dict[tuple[str, ...], list[list[str]]]:
+  """Deals each command line's functions round-robin into chunks.
+
+  n functions yield ceil(n / max_functions_per_chunk) chunks; chunk i takes
+  every chunk-count-th function starting at index i. Raises ValueError if
+  max_functions_per_chunk is not positive, RuntimeError on an empty list.
+  """
+  if max_functions_per_chunk <= 0:
+    raise ValueError('max_functions_per_chunk must be positive, got '
+                     f'{max_functions_per_chunk}.')
+  chunks = {}
+  for command_line, paths in functions_per_command_line.items():
+    count = math.ceil(len(paths) / max_functions_per_chunk)
+    if count == 0:
+      raise RuntimeError('Expected chunks_for_command_line to be greater than '
+                         f'zero, actually got {count}.')
+    chunks[command_line] = [paths[i::count] for i in range(count)]
+  return chunks
+
+
+def get_chunks(
+    function_folder: str, delete_flags: tuple[str, ...],
+    max_functions_per_chunk: int) -> dict[tuple[str], list[list[str]]]:
+  """Groups a function corpus by command line, then splits it into chunks."""
+  functions_per_command_line = _get_functions_chunked_by_command_line(
+      function_folder, delete_flags)
+  return _partition_functions(functions_per_command_line,
+                              max_functions_per_chunk)
+
+
+def combine_chunks(function_chunks: dict[tuple[str], list[list[str]]],
+                   llvm_link_path: str, output_folder: str):
+  """Links every chunk into one module and writes the resulting corpus.
+
+  Chunk i becomes <output_folder>/<i>.bc (via llvm-link, whose failure raises
+  subprocess.CalledProcessError) and <i>.cmd (NUL-separated command line).
+  """
+  chunks_written = 0
+  for command_line, chunks in function_chunks.items():
+    for function_chunk in chunks:
+      prefix = os.path.join(output_folder, str(chunks_written))
+      link_command = [llvm_link_path, '-o', prefix + '.bc', *function_chunk]
+      subprocess.run(link_command, capture_output=True, check=True)
+      with open(prefix + '.cmd', 'w', encoding='utf-8') as cmd_file:
+        cmd_file.write('\0'.join(command_line))
+      chunks_written += 1
+
+  # chunks_written equals the chunk count: modules are 0..chunks_written - 1.
+  corpus_description = {
+      'has_thinlto': False,
+      'modules': [str(index) for index in range(chunks_written)]
+  }
+  description_path = os.path.join(output_folder, 'corpus_description.json')
+  with open(description_path, 'w', encoding='utf-8') as description_file:
+    json.dump(corpus_description, description_file)
diff --git a/compiler_opt/tools/regalloc_trace/group_functions_lib_test.py b/compiler_opt/tools/regalloc_trace/group_functions_lib_test.py
new file mode 100644
index 0000000..e420f07
--- /dev/null
+++ b/compiler_opt/tools/regalloc_trace/group_functions_lib_test.py
@@ -0,0 +1,127 @@
+# Copyright 2020 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Unit tests for group_functions_lib."""
+
+import os
+
+from absl.testing import absltest
+
+from compiler_opt.tools.regalloc_trace import group_functions_lib
+from compiler_opt.rl import corpus
+from compiler_opt.testing import corpus_test_utils
+
+
+class GroupFunctionsTest(absltest.TestCase):
+  """Tests for chunk computation and chunk linking in group_functions_lib."""
+
+  def test_get_chunks_one_command_line(self):
+    """Functions sharing a command line that fit in one chunk stay together."""
+    corpus_path = self.create_tempdir().full_path
+    corpus.create_corpus_for_testing(
+        corpus_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=3, command_line=('-cc1',))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_path, (), 2)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[
+                os.path.join(corpus_path, 'module1.bc'),
+                os.path.join(corpus_path, 'module2.bc')
+            ]]
+        })
+
+  def test_get_chunks_two_command_lines(self):
+    """Functions with different command lines never share a chunk."""
+    corpus_path = self.create_tempdir().full_path
+    corpus.create_corpus_for_testing(
+        corpus_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(
+                name='module2', size=5, command_line=('-cc1', '-O3'))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_path, (), 2)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[os.path.join(corpus_path, 'module1.bc')]],
+            ('-cc1', '-O3'): [[os.path.join(corpus_path, 'module2.bc')]]
+        })
+
+  def test_get_chunks_multiple_chunks(self):
+    """A group larger than the chunk limit is split into several chunks."""
+    corpus_path = self.create_tempdir().full_path
+    corpus.create_corpus_for_testing(
+        corpus_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=3, command_line=('-cc1',))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_path, (), 1)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[os.path.join(corpus_path, 'module1.bc')],
+                        [os.path.join(corpus_path, 'module2.bc')]]
+        })
+
+  def test_get_chunks_multiple_uneven_chunks(self):
+    """Uneven groups are dealt round-robin, largest function first."""
+    corpus_path = self.create_tempdir().full_path
+    corpus.create_corpus_for_testing(
+        corpus_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=4, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module3', size=3, command_line=('-cc1',)),
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_path, (), 2)
+    self.assertDictEqual(
+        corpus_chunks, {
+            ('-cc1',): [[
+                os.path.join(corpus_path, 'module1.bc'),
+                os.path.join(corpus_path, 'module3.bc')
+            ], [os.path.join(corpus_path, 'module2.bc')]]
+        })
+
+  def test_combine_chunks(self):
+    """Linking a single chunk emits the bitcode, .cmd and description files."""
+    corpus_path = self.create_tempdir().full_path
+    corpus.create_corpus_for_testing(
+        corpus_path,
+        elements=[
+            corpus.ModuleSpec(name='module1', size=5, command_line=('-cc1',)),
+            corpus.ModuleSpec(name='module2', size=3, command_line=('-cc1',))
+        ])
+    corpus_chunks = group_functions_lib.get_chunks(corpus_path, (), 2)
+    # The fake llvm-link is given 'touch $2' as its command; $2 is the path
+    # that combine_chunks passes right after '-o', i.e. the output module.
+    fake_llvm_link_binary = self.create_tempfile('fake_llvm_link')
+    fake_llvm_link_invocations = self.create_tempfile(
+        'fake_llvm_link_invocations')
+    corpus_test_utils.create_test_binary(fake_llvm_link_binary.full_path,
+                                         fake_llvm_link_invocations.full_path,
+                                         ['touch $2'])
+
+    output_folder = self.create_tempdir()
+    group_functions_lib.combine_chunks(corpus_chunks,
+                                       fake_llvm_link_binary.full_path,
+                                       output_folder.full_path)
+    # assertContainsSubset takes (expected_subset, actual_set), in that order.
+    self.assertContainsSubset(['corpus_description.json', '0.bc', '0.cmd'],
+                              os.listdir(output_folder.full_path))
+
+
+if __name__ == '__main__':  # Run the tests only when executed as a script.
+ absltest.main()