blob: 3617a31649a7c75ed2d5a2f4f0394b78ed1a612d [file] [log] [blame]
#!/usr/bin/env python3.8
# Copyright 2021 The Fuchsia Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Wraps a command so that its outputs are timestamp-fresh only if their contents change.
Every declared output is renamed with a temporary suffix in the command.
If the command succeeds, the temporary file is moved over the original declared
output if the output did not already exist or the contents are different.
This conditional move is done for every declared output that appears in the
arguments list.
This is intended to be used in build systems like Ninja that support `restat`:
treating unchanged outputs as up-to-date, which has the potential to prune
the action graph on-the-fly.
Assumptions:
Output files can be whole shell tokens in the command's arguments.
We also support filenames as lexical substrings in tokens like
"--flag=out1,out2" or just "out1,out2".
If x is a writeable path (output), then x.any_suffix is also writeable.
If x is a writeable path (output), then dirname(x) is also writeable.
Command being wrapped does not change behavior with the name of its output
arguments.
If any of the above assumptions do not hold, then we recommend --disable
wrapping.
"""
import argparse
import filecmp
import os
import shutil
import subprocess
import sys
import time
from typing import Any, Callable, Collection, Dict, Iterable, Sequence, Tuple
import dataclasses
# Basename of this script; used as the "[name]" prefix of diagnostic messages.
_SCRIPT_BASENAME = os.path.basename(__file__)
def _partition(
        iterable: Iterable[Any],
        predicate: Callable[[Any],
                            bool]) -> Tuple[Sequence[Any], Sequence[Any]]:
    """Separates items into those that satisfy a predicate and those that do not.

    The predicate is evaluated exactly once per item, and the relative order
    of items is preserved within each result.

    Args:
      iterable: items to classify.
      predicate: function returning a truthy value for items that belong in
        the first result.

    Returns:
      A pair of lists: (items accepted by predicate, items rejected).
    """
    accepted = []
    rejected = []
    for element in iterable:
        bucket = accepted if predicate(element) else rejected
        bucket.append(element)
    return accepted, rejected
def files_match(file1: str, file2: str):
    """Compares the contents of two files.

    No existence check is done here; both paths are handed directly to
    filecmp.cmp, which is left to report a missing file.

    Args:
      file1: path to the first file.
      file2: path to the second file.

    Returns:
      True if the contents of the two files are identical, else False.
    """
    # filecmp.cmp does not invoke any subprocesses.
    # shallow=False forces a comparison of contents, not just os.stat() data.
    same_contents = filecmp.cmp(file1, file2, shallow=False)
    return same_contents
def ensure_file_exists(path: str, delays: Iterable[float] = (3, 6, 15)) -> None:
    """Assert that a file exists, or wait for it to appear.

    It has been shown that some fault tolerance is needed
    regarding expecting files to be produced by a subprocess.

    The path is checked before each wait and once more after the final wait,
    so every delay gives the file a chance to appear.

    Args:
      path: path to file that is expected to exist.
      delays: successive waiting periods, in seconds, between checks.

    Raises:
      FileNotFoundError if path does not exist, even after waiting.
    """
    for delay in delays:
        if os.path.exists(path):
            return
        # This branch should be highly unlikely, so it is allowed to be slow.
        # Either the original command failed to produce this file, or something
        # could be wrong with file system synchronization or delays.
        # Flush writes, sleep, try again.
        print(
            f"[{_SCRIPT_BASENAME}] Expected output file not found: {path} (Retrying after {delay}s ...)"
        )
        # os.sync() is not available on every platform; skip it when absent.
        if hasattr(os, "sync"):
            os.sync()
        time.sleep(delay)

    # Final check: without it, the last wait above could never succeed.
    if os.path.exists(path):
        return

    raise FileNotFoundError(
        f"[{_SCRIPT_BASENAME}] *** Expected output file not found: {path}")
def retry_file_op_once_with_delay(
        fileop: Callable[[], Any], failmsg: str, delay: int):
    """Runs a file operation, retrying it once if a file is not found.

    Insanity is doing the same thing and expecting a different result.

    Args:
      fileop: operation to run, taking no arguments.
      failmsg: message to print if the first attempt fails.
      delay: seconds to wait before the second attempt.

    Returns:
      Whatever the successful call to fileop returned.

    Raises:
      FileNotFoundError if the second attempt fails as well.
      Any other exception raised by fileop is propagated without a retry.
    """
    try:
        return fileop()
    except FileNotFoundError:
        # one-time retry
        print(
            f'[{_SCRIPT_BASENAME}] {failmsg} (Retrying once after {delay}s.)')
        time.sleep(delay)
        # If this fails again, exception will be raised.
        return fileop()
def move_if_different(src: str, dest: str, verbose: bool = False) -> bool:
    """Replaces dest with src unless dest already has the same contents.

    In either case, src no longer exists afterwards: it is either moved
    over dest, or removed because dest already matches.

    Args:
      src: source path; ensure_file_exists() is applied to it first.
      dest: destination path
      verbose: if True, print whether a move actually happened.

    Returns:
      True if move occurred,
      False if the destination already matches source.
    """
    ensure_file_exists(src)

    # A nonexistent destination can never be considered up-to-date.
    already_current = os.path.exists(dest) and files_match(dest, src)

    if already_current:
        if verbose:
            print(f" === Cached: {dest}")
        # Leave dest (and its timestamp) untouched, discard the new copy.
        os.remove(src)
        return False

    if verbose:
        print(f" === Updated: {dest}")
    shutil.move(src, dest)
    return True
@dataclasses.dataclass
class TempFileTransform:
    """Represents a file name transform.

    At least temp_dir or suffix or basename_prefix must be non-blank.

    temp_dir: Write temporary files in here.
      If blank, paths are relative to working directory.
    suffix: Add this suffix to temporary files, e.g. ".tmp".
    basename_prefix: Add this prefix to the basename of the path.
      This can be a good choice over suffix when the underlying tool behavior
      is sensitive to the output file extension.
      Example: "foo/bar.txt", with prefix="tmp-" -> foo/tmp-bar.txt
    """
    temp_dir: str = ""
    suffix: str = ""
    basename_prefix: str = ""

    @property
    def valid(self) -> bool:
        """True if at least one of the three fields is non-blank."""
        return bool(self.temp_dir or self.suffix or self.basename_prefix)

    def transform(self, path: str) -> str:
        """Computes the temporary file name that corresponds to a path.

        Args:
          path: original file path.

        Returns:
          temp_dir joined with the directory of path, joined with the basename
          of path wrapped in basename_prefix and suffix.
          NOTE: os.path.join() discards earlier components when a later one
          is absolute, so temp_dir has no effect on an absolute path.
        """
        directory, basename = os.path.dirname(path), os.path.basename(path)
        return os.path.join(
            self.temp_dir, directory,
            self.basename_prefix + basename + self.suffix)
def split_transform_join(
        token: str, sep: str, transform: Callable[[str], str]) -> str:
    """Transforms each sep-delimited piece of token, keeping the separators.

    Args:
      token: text to split.
      sep: separator to split on, and to re-join with.
      transform: function applied to each piece.

    Returns:
      The transformed pieces, joined by sep.
    """
    pieces = token.split(sep)
    return sep.join(map(transform, pieces))
def lexically_rewrite_token(token: str, transform: Callable[[str], str]) -> str:
    """Lexically replaces substrings between delimiters.

    The token is split on "=" and each resulting part is split on ",".
    Every innermost substring is passed through transform, and the pieces
    are re-joined with their original delimiters.

    This can transform "--foo=bar,baz" into
      f("--foo") + "=" + f("bar") + "," + f("baz")

    Args:
      token: text to transform, like a shell token.
      transform: text transformation.

    Returns:
      text with substrings transformed.
    """
    return "=".join(
        ",".join(transform(piece)
                 for piece in part.split(","))
        for part in token.split("="))
class OutputSubstitution:
    """Parsed form of one element of the --outputs option.

    Attributes:
      output_name: the output file name to substitute.
      match_previous_option: option that must precede the file name,
        or blank if the file name is not tied to an option.
    """

    def __init__(self, spec: str):
        """Constructs an OutputSubstitution.

        Args:
          spec: a string that is either:
            * a filename to substitute
            * a specification of the form 'substitute_after:OPTION:FILENAME'
              where OPTION is a flag to match, like '--output',
              and FILENAME is the output file name to substitute.
            File names may not contain the characters: =:,
            See help for the --output option.

        Raises:
          ValueError: if spec starts with 'substitute_after:' but does not
            consist of exactly three ':'-separated fields.
        """
        # Defaults describe a plain file name.
        # A blank option will not be used for matching.
        option = ''
        filename = spec

        if spec.startswith('substitute_after:'):
            fields = spec.split(':')
            if len(fields) != 3:
                raise ValueError(
                    'Expecting a substitution specification FILENAME or '
                    f'substitute_after:OPTION:FILENAME, but got {spec}.')
            _, option, filename = fields

        self.match_previous_option = option
        self.output_name = filename
@dataclasses.dataclass
class Action:
    """Represents a set of parameters of a single build action.

    Attributes:
      command: the command to run, as a sequence of shell tokens.
      substitutions: maps each declared output file name to the option that
        must directly precede it for the name to be substituted.  A blank
        value means every occurrence of the file name is substituted.
      label: identifier of the action, used only in diagnostics.
    """
    command: Sequence[str] = dataclasses.field(default_factory=list)
    substitutions: Dict[str, str] = dataclasses.field(
        default_factory=dict)  # FrozenDict
    label: str = ""

    def substitute_command(
        self, tempfile_transform: "TempFileTransform"
    ) -> Tuple[Sequence[str], Dict[str, str]]:
        """Rewrites the command so that outputs go to temporary locations.

        Args:
          tempfile_transform: describes transformation to temporary file name.

        Returns:
          A pair of:
            * the rewritten command, prefixed with '/usr/bin/env' if the
              first token of the original command contains '='.
            * renamed outputs: keys: original file names,
              values: transformed temporary file names.
        """
        # renamed_outputs: keys: original file names, values: transformed temporary file names
        renamed_outputs = {}

        def replace_output_filename(arg: str, prev_opt: str) -> str:
            if arg not in self.substitutions:
                return arg
            match_previous = self.substitutions[arg]
            # Some output filenames requires the previous option to match.
            if match_previous != '' and prev_opt != match_previous:
                return arg
            new_arg = tempfile_transform.transform(arg)
            if arg != new_arg:
                renamed_outputs[arg] = new_arg
            return new_arg

        # Work on a list copy, so that any Sequence (e.g. a tuple) is accepted
        # by the list concatenation below.
        command = list(self.command)

        substituted_command = []
        # Subprocess calls do not work for commands that start with VAR=VALUE
        # environment variables, which is remedied by prefixing with 'env'.
        if command and '=' in command[0]:
            substituted_command.append('/usr/bin/env')

        # Pair every token with the token that precedes it ('' for the first).
        for prev_opt, tok in zip([''] + command[:-1], command):
            substituted_command.append(
                lexically_rewrite_token(
                    tok,
                    lambda x, prev=prev_opt: replace_output_filename(x, prev)))

        return substituted_command, renamed_outputs

    def run_cached(
            self,
            tempfile_transform: "TempFileTransform",
            verbose: bool = False,
            dry_run: bool = False) -> int:
        """Runs a modified command and conditionally moves outputs in-place.

        Args:
          tempfile_transform: describes transformation to temporary file name.
          verbose: If True, print substituted command before running it.
          dry_run: If True, print substituted command and stop.

        Returns:
          0 for a dry run, or if the command succeeded and all temporary
          outputs were processed; the command's exit code if it failed;
          1 if a temporary output could not be found.
        """
        # renamed_outputs: keys: original file names, values: transformed temporary file names
        substituted_command, renamed_outputs = self.substitute_command(
            tempfile_transform)

        if verbose or dry_run:
            for orig, renamed in renamed_outputs.items():
                print(f"=== renamed: {orig} -> {renamed}")
            cmd_str = " ".join(substituted_command)
            print(f"=== substituted command: {cmd_str}")

        if dry_run:
            return 0

        # mkdir when needed.
        if tempfile_transform.temp_dir:
            for new_arg in renamed_outputs.values():
                os.makedirs(os.path.dirname(new_arg), exist_ok=True)

        # Run the modified command.
        retval = subprocess.call(substituted_command)

        if retval != 0:
            # Option: clean-up .tmp files or leave them for inspection
            return retval

        # Otherwise command succeeded, so conditionally move outputs in-place.
        # TODO(fangism): This loop could be parallelized.
        move_err = False
        for orig_out, temp_out in renamed_outputs.items():
            try:
                retry_file_op_once_with_delay(
                    lambda: move_if_different(
                        src=temp_out, dest=orig_out, verbose=verbose),
                    f'Failed to update {temp_out} -> {orig_out}.',
                    5,
                )
            except FileNotFoundError as e:
                # Keep going, to report every missing output.
                print(e)
                move_err = True

        if move_err:
            print(" *** Aborting due to previous error.")
            return 1

        if verbose:
            unrenamed_outputs = set(self.substitutions) - set(renamed_outputs)
            if unrenamed_outputs:
                # Having un-renamed outputs is not an error, but rather an indicator
                # of a potentially missed opportunity to cache unchanged outputs.
                # Sorted, so that the diagnostic is stable across runs.
                unrenamed_formatted = " ".join(sorted(unrenamed_outputs))
                print(f" === Un-renamed outputs: {unrenamed_formatted}")

        return 0

    def run_twice_and_compare_outputs(
            self,
            tempfile_transform: "TempFileTransform",
            verbose: bool = False) -> int:
        """Runs a command twice, copying declared outputs in between.

        Compare both sets of outputs, and error out if any differ.
        The advantage of this variant over others is that it eliminates
        output-path sensitivities by running the *same* command twice.
        One possible disadvantage is that this may expose behavioral
        differences due to the non/pre-existence of outputs ahead of
        running the command.

        Args:
          tempfile_transform: used to rename backup copies of outputs.
          verbose: if True, print more diagnostics.
            (Currently not used by this method.)

        Returns:
          exit code 0 on command success and outputs match, else nonzero.
        """
        # Run the command the first time.
        retval = subprocess.call(self.command)

        # If the command failed, skip re-running.
        if retval != 0:
            return retval

        # Backup a copy of all declared outputs.
        # TODO(fangism): what do we do about symlinks?
        # TODO(fangism): An output *directory* is unexpected, coming from GN,
        # but has been observed.  For now skip it.
        # A nonexistent output would be caught by action_tracer.py.
        renamed_outputs = {
            out: tempfile_transform.transform(out)
            for out in self.substitutions
            if os.path.isfile(out)
        }

        for out, backup in renamed_outputs.items():
            if tempfile_transform.temp_dir:
                os.makedirs(os.path.dirname(backup), exist_ok=True)
            # preserve metadata such as timestamp
            shutil.copy2(out, backup, follow_symlinks=False)

        rerun_retval = subprocess.call(self.command)
        if rerun_retval != 0:
            print(
                f"""Re-run of command {self.command} failed, while first time succeeded!?"""
            )
            return rerun_retval

        return verify_files_match(fileset=renamed_outputs, label=self.label)

    def run_twice_with_substitution_and_compare_outputs(
            self,
            tempfile_transform: "TempFileTransform",
            verbose: bool = False) -> int:
        """Runs a command twice, the second time with renamed outputs, and compares.

        Caveat: If the contents of the outputs are sensitive to the names of the
        outputs, this will find too many differences.

        Args:
          tempfile_transform: used to rename backup copies of outputs.
          verbose: if True, print more diagnostics.

        Returns:
          exit code 0 on command success and outputs match, else nonzero.
        """
        # renamed_outputs: keys: original file names, values: transformed temporary file names
        substituted_command, renamed_outputs = self.substitute_command(
            tempfile_transform)

        if verbose:
            cmd_str = " ".join(substituted_command)
            print(f"=== substituted command: {cmd_str}")

        # mkdir when needed.
        if tempfile_transform.temp_dir:
            for new_arg in renamed_outputs.values():
                os.makedirs(os.path.dirname(new_arg), exist_ok=True)

        # Run the original command.
        retval = subprocess.call(self.command)

        # If the command failed, skip re-running.
        if retval != 0:
            return retval

        # Otherwise command succeeded, re-run with different output locations.
        rerun_retval = subprocess.call(substituted_command)
        if rerun_retval != 0:
            print(
                f"Re-run failed with substituted outputs of target [{self.label}]: {substituted_command}"
            )
            return rerun_retval

        return verify_files_match(fileset=renamed_outputs, label=self.label)
def verify_files_match(fileset: Dict[str, str], label: str) -> int:
    """Compares each file against its backup and reports any differences.

    Backups that match are removed; backups that differ are kept.

    Args:
      fileset: {file: backup} key-value pairs of files to compare.  Backup files
        that match are removed to save space, while the .keys() files are kept.
      label: An identifier for the action that was run, for diagnostics.

    Returns:
      exit code 0 if all files matched, else 1.
    """
    identical = []
    mismatched = []
    # Compare every pair before removing anything.
    # If either file is missing, this will fail, which indicates that
    # something is not working as expected.
    for pair in fileset.items():
        original, backup = pair
        if files_match(original, backup):
            identical.append(pair)
        else:
            mismatched.append(pair)

    # Remove any files that matched to save space.
    for _, backup in identical:
        os.remove(backup)

    if not mismatched:
        return 0

    print(
        f"Repeating command for target [{label}] with renamed outputs produces different results:"
    )
    for original, backup in mismatched:
        print(f" {original} vs. {backup}")

    # Keep around different outputs for analysis.
    # Note: Even though the original command succeeded, forcing this to
    # fail may influence tools that examine the freshness of outputs
    # relative to the last succeeded command.
    return 1
def main_arg_parser() -> argparse.ArgumentParser:
    """Builds the command-line parser used by main().

    Returns:
      A parser for the wrapper's options, followed by the positional
      command (and its arguments) to run.
    """
    parser = argparse.ArgumentParser(
        description="Wraps a GN action to preserve unchanged outputs",
        argument_default=[],
    )
    add = parser.add_argument

    # label is only used for diagnostics
    add("--label", type=str, default="", help="The wrapped target's label")

    outputs_help = (
        "An action's declared outputs. "
        "When an element is a plain file name, all occurrences of that file name "
        "will be substituted in the command that writes temporary outputs. "
        "When an element has the form 'substitute_after:OPTION:FILENAME', "
        "only occurrences of FILENAME found in the option argument of OPTION "
        "will be substituted (examples: OPTION=-o or OPTION=--out). "
        "The latter form is recommended when an output filename can occur in "
        "multiple locations in a command line. "
        "File names must not contain =,: characters.")
    add("--outputs", nargs="*", help=outputs_help)

    # Options that control how temporary file names are formed.
    string_options = (
        ("--temp-suffix", "", "Suffix to use for temporary outputs"),
        (
            "--temp-prefix", "tmp-",
            "Basename prefix to use for temporary outputs"),
        (
            "--temp-dir", "",
            "Temporary directory for writing, can be relative to working directory or absolute."
        ),
    )
    for flag, default, description in string_options:
        add(flag, type=str, default=default, help=description)

    for flag, description in (
        (
            "--verbose",
            "Print information about which outputs were renamed/cached."),
        ("--dry-run", "Show transformed command and exit."),
    ):
        add(flag, action="store_true", default=False, help=description)

    # Wrapping is enabled by default; --disable clears args.enable.
    add(
        "--disable",
        action="store_false",
        dest="enable",
        default=True,
        help="If disabled, run the original command as-is.",
    )

    for flag, description in (
        (
            "--check-repeatability",
            "Check for repeatability: run the command twice, with different outputs, and compare."
        ),
        (
            "--rename-outputs",
            "When checking for repeatability: rename command-line outputs on the second run."
        ),
    ):
        add(flag, action="store_true", default=False, help=description)

    # Positional args are the command and arguments to run.
    add("command", nargs="*", help="The command to run")
    return parser
def main() -> int:
    """Parses the command line and runs the wrapped command.

    Depending on the options, the command is either run as-is, run twice to
    check for repeatability, or run with outputs redirected to temporary
    locations that are conditionally moved in-place.

    Returns:
      exit code for the process: that of the selected mode of operation,
      or 1 if an element of --outputs is malformed.

    Raises:
      ValueError: if the temporary dir, suffix and prefix are all blank.
      SystemExit: (via parser.error) if no command was given.
    """
    parser = main_arg_parser()
    args = parser.parse_args()

    # The command is a nargs="*" positional, so it can be empty.
    # Report that as a usage error, instead of failing on args.command[0].
    if not args.command:
        parser.error("Missing command to run.")

    tempfile_transform = TempFileTransform(
        temp_dir=args.temp_dir,
        suffix=args.temp_suffix,
        basename_prefix=args.temp_prefix,
    )
    if not tempfile_transform.valid:
        raise ValueError(
            "Need at least one of --temp-dir, --temp-suffix, or --temp-prefix, but all are blank."
        )

    wrap = args.enable
    # Decide whether or not to wrap the action script.
    ignored_scripts = {
        # If the action is only copying or linking, don't bother wrapping.
        "ln",
        "cp",  # TODO: Could conditionally copy if different.
        "rsync",
    }
    script = args.command[0]
    if os.path.basename(script) in ignored_scripts:
        wrap = False

    # If disabled, run the original command as-is.
    if not wrap:
        return subprocess.call(args.command)

    # Otherwise, rewrite the command using temporary outputs.
    try:
        substitutions = [OutputSubstitution(x) for x in args.outputs]
    except ValueError as e:
        print(str(e))
        return 1

    substitutions_dict = {
        x.output_name: x.match_previous_option for x in substitutions
    }

    # Run a modified command that can leave unchanged outputs untouched.
    action = Action(
        command=args.command,
        substitutions=substitutions_dict,
        label=args.label,
    )

    # Run one of the following modes:
    #   check_repeatability: run the command twice, and compare the outputs.
    #   [default]: redirect outputs to temporary locations, and move them
    #     in-place to their original locations if contents have changed.
    if args.check_repeatability:
        if args.rename_outputs:
            # This check variant will find path-sensitive outputs,
            # and nondeterministic outputs.
            return action.run_twice_with_substitution_and_compare_outputs(
                tempfile_transform=tempfile_transform, verbose=args.verbose)

        # This check will only find nondeterministic outputs.
        # For example, those affected by the current time.
        return action.run_twice_and_compare_outputs(
            tempfile_transform=tempfile_transform, verbose=args.verbose)

    return action.run_cached(
        tempfile_transform=tempfile_transform,
        verbose=args.verbose,
        dry_run=args.dry_run,
    )
# Script entry point: main()'s return value becomes the process exit code.
if __name__ == "__main__":
    sys.exit(main())