# blob: 0b9241755e1297e424bdce640efec08b1a433b06 [file] [log] [blame]
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Extract IR for training.
Extract IR for training, either from a compile_commands.json file produced by
cmake, or a linker parameter list file.
Only run with 'python compiler_opt/tools/extract_ir.py ...'
The compilation is assumed to have been performed with clang, using
-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all)
In a ThinLTO case, the compilation is assumed to have been performed specifying
-mllvm -lto-embed-bitcode=post-merge-pre-opt.
"""
import json
import multiprocessing
import os
import re
import shlex
import shutil
import subprocess
from typing import Dict
from typing import List
from typing import Optional

from absl import app
from absl import flags
from absl import logging
# Command line flags of the tool. --input and --output_dir default to None and
# are expected to be provided by the user.
flags.DEFINE_string(
    'input', None,
    'Input file - either compile_commands.json or a linker parameter list')
flags.DEFINE_enum(
    'input_type', 'json', ['json', 'params'],
    'Input file type - json or params. The latter refers to lld params.')
flags.DEFINE_string('output_dir', None, 'Output directory')
flags.DEFINE_integer(
    'num_workers', None,
    'Number of parallel workers for objcopy. `None` for maximum available.')
flags.DEFINE_string('llvm_objcopy_path', 'llvm-objcopy', 'Path to llvm-objcopy')
flags.DEFINE_string(
    'obj_base_dir', '',
    'Base directory for object files. Defaults to current working dir.')
flags.DEFINE_string(
    'cmd_filter', None,
    'Include only those modules with a command line matching this regexp. '
    'Setting it to None for not filtering. Note that the regexp is applied '
    'independently for each separate command line option. For example, ^-Oz$ '
    'will match Oz - built binaries.')
# The help text is assembled from adjacent literals; each piece but the last
# ends with a space so that the words do not run together.
flags.DEFINE_bool(
    'thinlto_build', False, 'Set if the build was ThinLTO, to '
    'ensure index files are also copied. The build is assumed to have had '
    '-mllvm -lto-embed-bitcode=post-merge-pre-opt passed to clang.')

FLAGS = flags.FLAGS
# TODO(ml-compiler-opt): maybe we can also convert the cmdline file here, from
# a \0-separated list of strings to a \n-separated one.
def should_include_module(cmdline: str, match_regexp: str) -> bool:
    """Decide whether a module passes the command line filter.

    Args:
      cmdline: the module's command line, as a NUL-separated list of options.
      match_regexp: regular expression searched for in each option on its own.
        None disables filtering.

    Returns:
      True if no filter is given, or if the filter matches somewhere in at
      least one of the options; False otherwise.
    """
    if match_regexp is None:
        return True
    for option in cmdline.split('\0'):
        if re.search(match_regexp, option) is not None:
            return True
    return False
def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]:
    """Find the ThinLTO index file named on a command line.

    Args:
      cmdline: the command line, as a NUL-separated list of options.
      basedir: directory the index file path is joined onto.

    Returns:
      The value of the first `-fthinlto-index=<path>` option joined onto
      basedir, or None if the command line has no such option.
    """
    prefix = '-fthinlto-index='
    for option in cmdline.split('\0'):
        if option.startswith(prefix):
            # Take everything after the first '=': the path may itself contain
            # '=' characters, which must not truncate it.
            return os.path.join(basedir, option[len(prefix):])
    return None
class TrainingIRExtractor:
    """IR and command line extraction from an object file.

    The object file is assumed to have the .llvmbc and .llvmcmd sections.
    """

    def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None):
        """Set up a TrainingIRExtractor.

        Args:
          obj_relative_path: relative path to the input object file. It will
            be also used to construct the absolute path of the output IR and
            cmd files, by appending it to output_base_dir.
          output_base_dir: the directory under which the output will be
            produced.
          obj_base_dir: the base directory for all the input object files.
            None is treated as the empty string.
        """
        self._obj_relative_path = obj_relative_path
        self._output_base_dir = output_base_dir
        self._obj_base_dir = obj_base_dir if obj_base_dir is not None else ''

    def obj_base_dir(self):
        """Return the base directory of the input object files."""
        return self._obj_base_dir

    def output_base_dir(self):
        """Return the directory under which outputs are produced."""
        return self._output_base_dir

    def relative_output_path(self):
        """Return the output path relative to the output base directory."""
        return self._obj_relative_path

    def input_obj(self):
        """Return the path of the input object file."""
        return os.path.join(self.obj_base_dir(), self._obj_relative_path)

    def dest_dir(self):
        """Return the directory the outputs of this module are written to."""
        return os.path.join(self.output_base_dir(),
                            os.path.dirname(self._obj_relative_path))

    def module_name(self):
        """Return the file name of the object, without its directory."""
        return os.path.basename(self._obj_relative_path)

    def cmd_file(self):
        """Return the path of the extracted command line file."""
        return os.path.join(self.dest_dir(), self.module_name() + '.cmd')

    def bc_file(self):
        """Return the path of the extracted bitcode file."""
        return os.path.join(self.dest_dir(), self.module_name() + '.bc')

    def thinlto_index_file(self):
        """Return the path the ThinLTO index file is copied to."""
        return os.path.join(self.dest_dir(),
                            self.module_name() + '.thinlto.bc')

    def _get_extraction_cmd_command(self, llvm_objcopy_path):
        """Build the llvm-objcopy command dumping .llvmcmd to cmd_file()."""
        return [
            llvm_objcopy_path, '--dump-section=.llvmcmd=' + self.cmd_file(),
            self.input_obj(), '/dev/null'
        ]

    def _get_extraction_bc_command(self, llvm_objcopy_path):
        """Build the llvm-objcopy command dumping .llvmbc to bc_file()."""
        return [
            llvm_objcopy_path, '--dump-section=.llvmbc=' + self.bc_file(),
            self.input_obj(), '/dev/null'
        ]

    def extract(self, llvm_objcopy_path: str, cmd_filter: Optional[str],
                is_thinlto: bool) -> Optional[str]:
        """Run llvm-objcopy to extract the .bc and command line.

        Args:
          llvm_objcopy_path: the llvm-objcopy executable to run.
          cmd_filter: regexp the module's command line must match for the
            module to be kept; None keeps every module.
          is_thinlto: whether to also copy the ThinLTO index file named by the
            module's command line.

        Returns:
          relative_output_path() if the module was extracted. None if the
          input object does not exist, the module is filtered out, its
          command line names no ThinLTO index although is_thinlto is set, or
          llvm-objcopy fails.
        """
        if not os.path.exists(self.input_obj()):
            logging.info('%s does not exist.', self.input_obj())
            return None
        os.makedirs(self.dest_dir(), exist_ok=True)
        try:
            subprocess.run(
                self._get_extraction_cmd_command(llvm_objcopy_path),
                check=True)
            if cmd_filter is not None or is_thinlto:
                # The options are NUL-separated, so the whole file is one
                # command line; read it as such rather than asserting on the
                # number of text lines.
                with open(self.cmd_file()) as f:
                    cmdline = f.read()
                if not should_include_module(cmdline, cmd_filter):
                    logging.info(
                        'Excluding module %s because it does not match the '
                        'filter', self.input_obj())
                    os.remove(self.cmd_file())
                    return None
                if is_thinlto:
                    index_file = get_thinlto_index(cmdline,
                                                   self.obj_base_dir())
                    if index_file is None:
                        # Without an index there is nothing to copy; skip the
                        # module instead of failing inside shutil.copy.
                        logging.warning(
                            '%s was not processed: its command line has no '
                            '-fthinlto-index option.', self.input_obj())
                        os.remove(self.cmd_file())
                        return None
                    shutil.copy(index_file, self.thinlto_index_file())
            subprocess.run(
                self._get_extraction_bc_command(llvm_objcopy_path),
                check=True)
        except subprocess.CalledProcessError as e:
            # This may happen if .o file was build from asm (.S source).
            logging.warning('%s was not processed: %s', self.input_obj(), e)
            return None
        # Sanity check: every expected output must be present at this point.
        assert (os.path.exists(self.cmd_file()) and
                os.path.exists(self.bc_file()) and
                (not is_thinlto or os.path.exists(self.thinlto_index_file())))
        return self.relative_output_path()
def convert_compile_command_to_objectfile(command: Dict[str, str],
                                          output_dir: str):
    """Create the extractor for one compile_commands.json entry.

    Args:
      command: an entry of the compilation database. It must have a
        'directory' and either a 'command' (a single shell-escaped string) or
        an 'arguments' (a list of strings) field. An 'output' field is used
        when the command line carries no '-o' option.
      output_dir: the directory under which the output will be produced.

    Returns:
      A TrainingIRExtractor for the object file produced by the entry.

    Raises:
      KeyError: if 'directory', or both 'command' and 'arguments', are
        missing.
      ValueError: if the object file cannot be determined from the entry.
    """
    obj_base_dir = command['directory']
    if 'command' in command:
        # The command is shell-escaped, so it needs shell-like splitting:
        # plain whitespace splitting breaks quoted paths containing spaces.
        cmd_parts = shlex.split(command['command'])
    else:
        cmd_parts = list(command['arguments'])
    obj_rel_path = None
    if '-o' in cmd_parts[:-1]:
        obj_rel_path = cmd_parts[cmd_parts.index('-o') + 1]
    else:
        obj_rel_path = command.get('output')
    if obj_rel_path is None:
        raise ValueError(
            'Cannot determine the object file of the compile command: '
            '{}'.format(cmd_parts))
    # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files?
    return TrainingIRExtractor(
        obj_relative_path=obj_rel_path,
        output_base_dir=output_dir,
        obj_base_dir=obj_base_dir)
def load_from_compile_commands(json_array: List[Dict[str, str]],
                               output_dir: str) -> List[TrainingIRExtractor]:
    """Create one extractor per entry of a parsed compile_commands.json.

    Args:
      json_array: the entries of the compilation database.
      output_dir: the directory under which the output will be produced.

    Returns:
      The extractors, in the order of the entries.
    """
    extractors = []
    for entry in json_array:
        extractors.append(
            convert_compile_command_to_objectfile(entry, output_dir))
    return extractors
def load_from_lld_params(params_array: List[str], obj_base_dir: str,
                         output_dir: str) -> List[TrainingIRExtractor]:
    """Create an ObjectFile array based on lld's parameters.

    Args:
      params_array: the linker parameters, one per element. The list is left
        unmodified.
      obj_base_dir: the base directory for all the input object files.
      output_dir: the directory under which the output will be produced.

    Returns:
      One TrainingIRExtractor per object file. If there is an '-o' option, it
      and its argument are dropped and the object files are the remaining
      parameters not starting with '-' and ending in '.o'. Without '-o',
      every parameter is taken to be an object file.
    """
    try:
        minus_o_idx = params_array.index('-o')
    except ValueError:
        logging.info('This params file does not have an explicit -o option.')
        just_obj_paths = list(params_array)
    else:
        # Yank out -o and the output by building a new list, so that the
        # caller's list is not changed behind its back.
        remaining = (
            params_array[:minus_o_idx] + params_array[minus_o_idx + 2:])
        just_obj_paths = [
            o for o in remaining
            if not o.startswith('-') and o.endswith('.o')
        ]
    return [
        TrainingIRExtractor(
            obj_relative_path=obj_file,
            output_base_dir=output_dir,
            obj_base_dir=obj_base_dir) for obj_file in just_obj_paths
    ]
# This is here just for readability, lint complains if the pooling expression is
# over 3 lines; and it needs to be a non-local so it may be pickled.
def extract_artifacts(obj: TrainingIRExtractor) -> str:
    """Extract one module, taking the extraction settings from the flags.

    Args:
      obj: the extractor of the module.

    Returns:
      Whatever obj.extract returns for the llvm_objcopy_path, cmd_filter and
      thinlto_build flag values.
    """
    objcopy_path = FLAGS.llvm_objcopy_path
    module_filter = FLAGS.cmd_filter
    thinlto = FLAGS.thinlto_build
    return obj.extract(objcopy_path, module_filter, thinlto)
def main(argv):
    """Extract the IR of all modules listed in the input file.

    Loads the modules from --input according to --input_type, extracts them in
    parallel and writes the relative paths of the extracted ones, one per
    line, to the 'module_paths' file in --output_dir.

    Args:
      argv: the command line arguments left after flag parsing.

    Raises:
      app.UsageError: if there are positional arguments, or if --input or
        --output_dir is not specified.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    # Flags are already parsed when main runs, so marking them as required
    # here would not validate anything; check them explicitly instead.
    if FLAGS.input is None or FLAGS.output_dir is None:
        raise app.UsageError('Flags --input and --output_dir are required.')

    objs = []
    if FLAGS.input_type == 'json':
        with open(FLAGS.input) as f:
            objs = load_from_compile_commands(json.load(f), FLAGS.output_dir)
    elif FLAGS.input_type == 'params':
        if not FLAGS.obj_base_dir:
            logging.info(
                '-obj_base_dir is unspecified, assuming current directory. '
                'If no objects are found, use this option to specify the root '
                'directory for the object file paths in the input file.')
        with open(FLAGS.input) as f:
            stripped = [line.strip() for line in f]
        # Blank lines do not name anything; do not turn them into objects.
        params = [param for param in stripped if param]
        objs = load_from_lld_params(params, FLAGS.obj_base_dir,
                                    FLAGS.output_dir)
    else:
        logging.error('Unknown input type: %s', FLAGS.input_type)

    # The context manager releases the worker processes once map() is done.
    with multiprocessing.Pool(FLAGS.num_workers) as pool:
        relative_output_paths = pool.map(extract_artifacts, objs)
    extracted = [path for path in relative_output_paths if path is not None]

    # Write all Non-None relative paths to FLAGS.output_dir/module_paths. The
    # directory may not exist yet if no module was extracted.
    os.makedirs(FLAGS.output_dir, exist_ok=True)
    with open(os.path.join(FLAGS.output_dir, 'module_paths'), 'w') as f:
        f.writelines(path + '\n' for path in extracted)
    logging.info('Converted %d files out of %d', len(extracted), len(objs))
if __name__ == '__main__':
    # Required flags have to be registered before app.run parses the command
    # line, so that a missing one is reported as a usage error.
    flags.mark_flags_as_required(['output_dir', 'input'])
    app.run(main)