blob: 4d65146000e8f0dedc224d54b19dc0da8be7abc1 [file] [log] [blame]
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Corpus and related concepts."""
import abc
import concurrent.futures
import math
import random
import re
from absl import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
import json
import os
import tensorflow as tf
from compiler_opt.rl import constant
# Alias to better self-document APIs. Represents a complete, ready-to-use
# command line, in which every flag that names a file references an existing,
# local file.
FullyQualifiedCmdLine = Tuple[str, ...]
def _apply_cmdline_filters(
orig_options: Tuple[str, ...],
additional_flags: Tuple[str, ...] = (),
delete_flags: Tuple[str, ...] = (),
replace_flags: Optional[Dict[str, str]] = None) -> Tuple[str]:
option_iterator = iter(orig_options)
matched_replace_flags = set()
replace_flags = replace_flags if replace_flags is not None else {}
option = next(option_iterator, None)
cmdline = []
while option is not None:
if any(option.startswith(flag) for flag in delete_flags):
if '=' not in option:
next(option_iterator, None)
else:
matching_replace = [
flag for flag in replace_flags if option.startswith(flag)
]
if not matching_replace:
cmdline.append(option)
else:
assert len(matching_replace) == 1
flag = matching_replace[0]
if flag in matched_replace_flags:
raise ValueError(f'{flag} was matched twice')
matched_replace_flags.add(flag)
if '=' not in option:
next(option_iterator, None)
cmdline.extend([option, replace_flags[flag]])
else:
cmdline.append(flag + '=' + replace_flags[flag])
option = next(option_iterator, None)
if len(matched_replace_flags) != len(replace_flags):
raise ValueError('flags that were expected to be replaced were not found')
cmdline.extend(additional_flags)
return tuple(cmdline)
@dataclass(frozen=True)
class LoadedModuleSpec:
  """In-memory contents of a module plus the recipe to materialize it.

  A LoadedModuleSpec can be passed to a remote location. There, given a local
  directory, build_command_line can be called: the data is saved under that
  directory, the final compiler command line is fully computed, and a
  ready-to-use FullyQualifiedCmdLine is returned.
  """
  # Module name; also the sub-directory (under `local_dir`) the files go to.
  name: str
  # Raw bytes written to `input.bc`.
  loaded_ir: bytes
  # Raw bytes written to `index.thinlto.bc`; None means no index is written.
  loaded_thinlto_index: Optional[bytes] = None
  # Command line templates; each is `format`-ed with a `context` object.
  orig_options: Tuple[str, ...] = ()

  def _create_files_and_get_context(self, local_dir: str):
    """Writes the module files under `local_dir` and describes their paths.

    Args:
      local_dir: directory under which a `self.name` sub-directory is created.

    Returns:
      A Corpus.ReplaceContext holding the path of the written module and the
      path of the written thinlto index (None when there is no index).
    """
    module_dir = os.path.join(local_dir, self.name)
    os.makedirs(module_dir, exist_ok=True)

    ir_path = os.path.join(module_dir, 'input.bc')
    with tf.io.gfile.GFile(ir_path, 'wb') as ir_file:
      ir_file.write(self.loaded_ir)

    if self.loaded_thinlto_index is None:
      index_path = None
    else:
      index_path = os.path.join(module_dir, 'index.thinlto.bc')
      with tf.io.gfile.GFile(index_path, 'wb') as index_file:
        index_file.write(self.loaded_thinlto_index)

    return Corpus.ReplaceContext(
        module_full_path=ir_path, thinlto_full_path=index_path)

  def build_command_line(self, local_dir: str) -> FullyQualifiedCmdLine:
    """Saves the module under `local_dir` and returns its final command line.

    Different LoadedModuleSpec objects must get different `local_dir`s.

    Args:
      local_dir: directory in which the module files are created.

    Returns:
      `orig_options`, each formatted with the context describing the files
      that were just written.
    """
    replace_context = self._create_files_and_get_context(local_dir)
    return tuple(
        template.format(context=replace_context)
        for template in self.orig_options)
@dataclass(frozen=True)
class ModuleSpec:
  """Metadata of a compilation unit.

  It carries the information needed for corpus operations such as sampling or
  filtering, and for the corpus to later load the module contents into a
  LoadedModuleSpec.
  """
  # Name of the module.
  name: str
  # Size of the module.
  size: int
  # Command line options associated with the module.
  command_line: Tuple[str, ...] = ()
  # Whether the module has thinlto data associated with it.
  has_thinlto: bool = False
class Sampler(metaclass=abc.ABCMeta):
  """Abstract interface for strategies that sample modules from a corpus."""

  @abc.abstractmethod
  def __call__(self,
               module_specs: Tuple[ModuleSpec],
               k: int,
               n: int = 20) -> List[ModuleSpec]:
    """Samples module specs; concrete samplers must override this.

    Args:
      module_specs: the module_specs to sample from.
      k: number of modules to sample.
      n: number of buckets to use.

    Returns:
      The list of sampled module specs.
    """
    raise NotImplementedError()
class SamplerBucketRoundRobin(Sampler):
  """Samples one random module_spec from each bucket, in round-robin order.

  The buckets are sequential sections of module_specs of roughly equal
  lengths.
  """

  def __init__(self):
    # Cache of bucket ranges, keyed by (len(module_specs), k, n).
    self._ranges = {}

  def __call__(self,
               module_specs: Tuple[ModuleSpec],
               k: int,
               n: int = 20) -> List[ModuleSpec]:
    """Picks one random element from each of `k` sequential buckets.

    Args:
      module_specs: the module_specs to sample from.
      k: number of modules to sample.
      n: number of buckets to use.

    Returns:
      A list with one module spec per bucket, in bucket visiting order.
    """
    # Credits to yundi@ for the highly optimized algo.
    # The specs are split into k buckets; the order in which those k buckets
    # are visited is chosen so that it approximates the behaviour of having
    # n buckets.
    total = len(module_specs)
    cache_key = (total, k, n)
    if cache_key not in self._ranges:
      stride = k // n
      # visit_rank[b] is the visiting priority of bucket b: lower ranks are
      # visited first, ties are broken by bucket index.
      if stride:
        visit_rank = [bucket % stride for bucket in range(k)]
      else:
        visit_rank = [0] * k
      # sorted() is stable, so equal ranks keep ascending bucket order.
      visit_order = sorted(range(k), key=visit_rank.__getitem__)
      # Half-open [start, end) index range of each bucket, in visiting order.
      width = total / k
      self._ranges[cache_key] = tuple(
          (math.floor(width * bucket), math.floor(width * (bucket + 1)))
          for bucket in visit_order)

    picks = []
    for start, end in self._ranges[cache_key]:
      picks.append(module_specs[random.randrange(start, end)])
    return picks
class Corpus:
  """Represents a corpus.

  A corpus is created from a corpus_description.json file, produced by
  extract_ir.py (for example).

  To use the corpus:
  - call sample to get a subset of modules (using the Sampler provided at
    initialization time). This returns a list of ModuleSpec objects
  - convert the ModuleSpecs to LoadedModuleSpecs with load_module_spec. This
    loads the contents of the modules in memory (hence this lazy approach).
    The caller may want to perform this step with a threadpool
  - pass the LoadedModuleSpecs to Workers
  - to use a LoadedModuleSpec, create a unique directory (i.e. tempdir) and
    pass it to build_command_line

  Example:
    corpus = Corpus(...)
    samples = corpus.sample(10)
    with ThreadPoolExecutor() as tp:
      futures = [tp.submit(corpus.load_module_spec, s) for s in samples]
      ...
      lms = [f.result() for f in futures]
    ...(pass lms values to workers)

  On the worker side:
    lm: LoadedModuleSpec = ...
    with tempfile.TemporaryDirectory() as tempdir:
      final_cmd_line = lm.build_command_line(tempdir)
      ...(prepend executable to final_cmd_line, run it)
  """

  @dataclass(frozen=True)
  class ReplaceContext:
    """Context for 'replace' rules.

    Command line templates are `format`-ed with an instance of this class
    bound to the name `context`.
    """
    # Full path of the module file.
    module_full_path: str
    # Full path of the thinlto index file, or None if there is none.
    thinlto_full_path: Optional[str] = None

  def __init__(self,
               *,
               data_path: str,
               module_filter: Optional[re.Pattern] = None,
               additional_flags: Tuple[str, ...] = (),
               delete_flags: Tuple[str, ...] = (),
               replace_flags: Optional[Dict[str, str]] = None,
               sampler: Sampler = SamplerBucketRoundRobin()):
    """
    Prepares the corpus by pre-loading all the CorpusElements and preparing for
    sampling. Command line origin (.cmd file or override) is decided, and final
    command line transformation rules are set (i.e. thinlto flags handled, also
    output) and validated.

    Args:
      data_path: corpus directory.
      module_filter: a regular expression used to filter 'in' modules with
        names matching it (using `match`, i.e. anchored at the start of the
        name). None to include everything.
      additional_flags: list of flags to append to the command line
      delete_flags: list of flags to remove (both `-flag=<value>` and
        `-flag <value>` are supported).
      replace_flags: list of flags to be replaced. The key in the dictionary
        is the flag. The value is a string that will be `format`-ed with a
        `context` object - see `ReplaceContext`.
        We verify that flags in replace_flags are present, and do not appear
        in the additional_flags nor delete_flags.
        Thinlto index is handled this way, too.
      sampler: the Sampler used by `sample`. The default is a single instance
        created once, when this method is defined.

    Raises:
      ValueError: if the corpus description lists no modules, if the global
        command override was left unfilled, if `-fthinlto-index` is passed in
        replace_flags for a thinlto corpus without override, if the same flag
        appears in more than one of the add/delete/replace sets, or if a
        .cmd file does not start with `-cc1`.
    """
    self._base_dir = data_path
    self._sampler = sampler
    # TODO: (b/233935329) Per-corpus *fdo profile paths can be read into
    # {additional|delete}_flags here
    with tf.io.gfile.GFile(
        os.path.join(data_path, 'corpus_description.json'), 'r') as f:
      corpus_description: Dict[str, Any] = json.load(f)

    module_paths = corpus_description['modules']
    if len(module_paths) == 0:
      raise ValueError(
          f'{data_path}\'s corpus_description contains no modules.')

    has_thinlto: bool = corpus_description['has_thinlto']

    cmd_override = ()
    cmd_override_was_specified = False
    if 'global_command_override' in corpus_description:
      cmd_override_was_specified = True
      if corpus_description[
          'global_command_override'] == constant.UNSPECIFIED_OVERRIDE:
        raise ValueError(
            'global_command_override in corpus_description.json not filled.')
      cmd_override = tuple(corpus_description['global_command_override'])
      if len(additional_flags) > 0:
        logging.warning(
            'Additional flags are specified together with override.')
      if len(delete_flags) > 0:
        logging.warning('Delete flags are specified together with override.')
      if replace_flags:
        logging.warning('Replace flags are specified together with override.')

    # Work on a copy: the thinlto handling below may add an entry, and the
    # caller's dictionary must not be modified.
    replace_flags = replace_flags.copy() if replace_flags else {}
    fthinlto_index_flag = '-fthinlto-index'

    if has_thinlto:
      additional_flags = ('-mllvm', '-thinlto-assume-merged') + additional_flags
      if cmd_override_was_specified:
        # The override is not expected to carry the index flag, so add it.
        additional_flags = (f'{fthinlto_index_flag}=' +
                            '{context.thinlto_full_path}',) + additional_flags
      else:
        # The .cmd file carries the index flag; repoint it to the local copy.
        if fthinlto_index_flag in replace_flags:
          raise ValueError(
              '-fthinlto-index must be handled by the infrastructure')
        replace_flags[fthinlto_index_flag] = '{context.thinlto_full_path}'

    additional_flags = ('-x', 'ir',
                        '{context.module_full_path}') + additional_flags

    # don't use add/remove for replace
    add_keys = set(k.split('=', maxsplit=1)[0] for k in additional_flags)
    if add_keys.intersection(
        set(replace_flags)) or set(delete_flags).intersection(
            set(replace_flags)) or add_keys.intersection(set(delete_flags)):
      raise ValueError('do not use add/delete flags to replace')

    if module_filter:
      module_paths = [
          name for name in module_paths if module_filter.match(name)
      ]

    def get_cmdline(name: str):
      """Returns the command line template for module `name`."""
      if cmd_override_was_specified:
        ret = cmd_override
      else:
        with tf.io.gfile.GFile(os.path.join(data_path, name + '.cmd')) as f:
          raw_cmd = f.read()
        # Every option is later passed through str.format (see
        # LoadedModuleSpec.build_command_line). Literal braces coming from
        # the .cmd file must be doubled, otherwise format() would treat them
        # as replacement fields and either fail or corrupt the option.
        ret = tuple(raw_cmd.replace('{', '{{').replace('}', '}}').split('\0'))
        # The options read from a .cmd file must be run with -cc1
        if ret[0] != '-cc1':
          raise ValueError('-cc1 flag not present in .cmd file.')
      return _apply_cmdline_filters(
          orig_options=ret,
          additional_flags=additional_flags,
          delete_flags=delete_flags,
          replace_flags=replace_flags)

    # perform concurrently because fetching file size may be slow (remote)
    with concurrent.futures.ThreadPoolExecutor() as tp:
      contents = tp.map(
          lambda name: ModuleSpec(
              name=name,
              size=tf.io.gfile.GFile(os.path.join(data_path, name + '.bc')).
              size(),
              command_line=get_cmdline(name),
              has_thinlto=has_thinlto), module_paths)
    # Largest modules first.
    self._module_specs = tuple(
        sorted(contents, key=lambda m: m.size, reverse=True))

  def sample(self, k: int, sort: bool = False) -> List[ModuleSpec]:
    """Samples `k` module_specs, optionally sorting by size descending.

    Use load_module_spec to get LoadedModuleSpecs - this allows the user
    decide how the loading should happen (e.g. may want to use a threadpool)

    Args:
      k: number of module specs to sample; capped at the corpus size.
      sort: if True, the result is sorted by size, largest first.

    Returns:
      The list of sampled ModuleSpecs.

    Raises:
      ValueError: if, after capping, fewer than 1 module would be sampled.
    """
    # Note: sampler is intentionally defaulted to a mutable object, as the
    # only mutable attribute of SamplerBucketRoundRobin is its range cache.
    k = min(len(self._module_specs), k)
    if k < 1:
      raise ValueError('Attempting to sample <1 module specs from corpus.')
    sampled_specs = self._sampler(self._module_specs, k=k)
    if sort:
      sampled_specs.sort(key=lambda m: m.size, reverse=True)
    return sampled_specs

  def load_module_spec(self, module_spec: ModuleSpec) -> LoadedModuleSpec:
    """Reads the files of `module_spec` into memory.

    Args:
      module_spec: the module to load; its `.bc` file (and its `.thinlto.bc`
        file, when `has_thinlto` is set) is read from the corpus directory.

    Returns:
      A LoadedModuleSpec with the bytes read and the module's command line.
    """
    with tf.io.gfile.GFile(
        os.path.join(self._base_dir, module_spec.name + '.bc'), 'rb') as f:
      module_bytes = f.read()
    thinlto_bytes = None
    if module_spec.has_thinlto:
      with tf.io.gfile.GFile(
          os.path.join(self._base_dir, module_spec.name + '.thinlto.bc'),
          'rb') as f:
        thinlto_bytes = f.read()
    return LoadedModuleSpec(
        name=module_spec.name,
        loaded_ir=module_bytes,
        loaded_thinlto_index=thinlto_bytes,
        orig_options=module_spec.command_line)

  @property
  def module_specs(self):
    """All ModuleSpecs of the corpus, as a tuple sorted by size descending."""
    return self._module_specs

  def __len__(self):
    """Number of modules in the corpus (after filtering)."""
    return len(self._module_specs)
def create_corpus_for_testing(location: str,
                              elements: List[ModuleSpec],
                              cmdline: Tuple[str, ...] = ('-cc1',),
                              cmdline_is_override=False,
                              is_thinlto=False,
                              **kwargs) -> Corpus:
  """Writes a synthetic corpus to `location` and opens it as a Corpus.

  Args:
    location: directory to create the corpus in (created if missing).
    elements: the modules to fabricate; only `name` and `size` are used. Each
      gets a `.bc` file of `size` bytes.
    cmdline: the command line, either written to each module's `.cmd` file
      or, if `cmdline_is_override`, stored as the global command override.
    cmdline_is_override: whether `cmdline` is a global override.
    is_thinlto: whether to create an (empty) `.thinlto.bc` file per module
      and mark the corpus as thinlto.
    **kwargs: forwarded to the Corpus constructor.

  Returns:
    The Corpus loaded from `location`.
  """
  os.makedirs(location, exist_ok=True)

  for spec in elements:
    bitcode_path = os.path.join(location, spec.name + '.bc')
    with tf.io.gfile.GFile(bitcode_path, 'wb') as bitcode_file:
      # Placeholder contents: `size` bytes, each of value 1.
      bitcode_file.write(b'\x01' * spec.size)

    if not cmdline_is_override:
      cmd_path = os.path.join(location, spec.name + '.cmd')
      with tf.io.gfile.GFile(cmd_path, 'w') as cmd_file:
        cmd_file.write('\0'.join(cmdline))

    if is_thinlto:
      index_path = os.path.join(location, spec.name + '.thinlto.bc')
      with tf.io.gfile.GFile(index_path, 'w') as index_file:
        index_file.write('')

  description = {
      'modules': [spec.name for spec in elements],
      'has_thinlto': is_thinlto,
  }
  if cmdline_is_override:
    description['global_command_override'] = cmdline

  description_path = os.path.join(location, 'corpus_description.json')
  with tf.io.gfile.GFile(description_path, 'w') as description_file:
    description_file.write(json.dumps(description))

  return Corpus(data_path=location, **kwargs)