[mypyc] Support incremental compilation This works by reworking IR generation to proceed one SCC at a time and writing out caches of serialized IR information so that we can generate code that calls into a module without compiling the module in full. A mypy plugin is used to ensure cache validity by checking that a hash of the metadata matches and that every generated source file is present and matches its recorded hash. Closes mypyc/mypyc#682.
diff --git a/mypyc/build.py b/mypyc/build.py index b187645..fad0995 100644 --- a/mypyc/build.py +++ b/mypyc/build.py
@@ -37,7 +37,7 @@ from mypyc.namegen import exported_name from mypyc.options import CompilerOptions from mypyc.errors import Errors -from mypyc.common import shared_lib_name +from mypyc.common import BUILD_DIR, shared_lib_name from mypyc.ops import format_modules from mypyc import emitmodule @@ -79,7 +79,8 @@ def get_mypy_config(paths: List[str], - mypy_options: Optional[List[str]]) -> Tuple[List[BuildSource], Options]: + mypy_options: Optional[List[str]], + compiler_options: CompilerOptions) -> Tuple[List[BuildSource], Options]: """Construct mypy BuildSources and Options from file and options lists""" # It is kind of silly to do this but oh well mypy_options = mypy_options or [] @@ -99,8 +100,7 @@ options.show_traceback = True # Needed to get types for all AST nodes options.export_types = True - # TODO: Support incremental checking - options.incremental = False + options.incremental = compiler_options.separate options.preserve_asts = True for source in sources: @@ -184,7 +184,7 @@ # Do the actual work now t0 = time.time() try: - result = emitmodule.parse_and_typecheck(sources, options) + result = emitmodule.parse_and_typecheck(sources, options, groups) except CompileError as e: for line in e.messages: print(line) @@ -283,14 +283,17 @@ want to write, skip writing so as to preserve the mtime and avoid triggering recompilation. 
""" + # We encode it ourselves and open the files as binary to avoid windows + # newline translation + encoded_contents = contents.encode('utf-8') try: - with open(path, 'r', encoding='utf-8') as f: - old_contents = f.read() # type: Optional[str] + with open(path, 'rb') as f: + old_contents = f.read() # type: Optional[bytes] except IOError: old_contents = None - if old_contents != contents: - with open(path, 'w', encoding='utf-8') as f: - f.write(contents) + if old_contents != encoded_contents: + with open(path, 'wb') as f: + f.write(encoded_contents) # Fudge the mtime forward because otherwise when two builds happen close # together (like in a test) setuptools might not realize the source is newer @@ -400,8 +403,12 @@ """ setup_mypycify_vars() - compiler_options = CompilerOptions(strip_asserts=strip_asserts, - multi_file=multi_file, verbose=verbose) + compiler_options = CompilerOptions( + strip_asserts=strip_asserts, + multi_file=multi_file, + verbose=verbose, + separate=separate is not False, + ) # Create a compiler object so we can make decisions based on what # compiler is being used. typeshed is missing some attribues on the @@ -413,13 +420,13 @@ for path in paths: expanded_paths.extend(glob.glob(path)) - build_dir = 'build' # TODO: can this be overridden?? + build_dir = BUILD_DIR # TODO: can this be overridden?? try: os.mkdir(build_dir) except FileExistsError: pass - sources, options = get_mypy_config(expanded_paths, mypy_options) + sources, options = get_mypy_config(expanded_paths, mypy_options, compiler_options) # We generate a shared lib if there are multiple modules or if any # of the modules are in package. (Because I didn't want to fuss # around with making the single module code handle packages.)
diff --git a/mypyc/common.py b/mypyc/common.py index efd4c34..07d15ca 100644 --- a/mypyc/common.py +++ b/mypyc/common.py
@@ -2,6 +2,8 @@ if MYPY: from typing_extensions import Final +BUILD_DIR = 'build' + PREFIX = 'CPyPy_' # type: Final # Python wrappers NATIVE_PREFIX = 'CPyDef_' # type: Final # Native functions etc. DUNDER_PREFIX = 'CPyDunder_' # type: Final # Wrappers for exposing dunder methods to the API
diff --git a/mypyc/emitmodule.py b/mypyc/emitmodule.py index 9a52f9d..84c425d 100644 --- a/mypyc/emitmodule.py +++ b/mypyc/emitmodule.py
@@ -3,15 +3,25 @@ # FIXME: Basically nothing in this file operates on the level of a # single module and it should be renamed. +import os +import hashlib +import json from collections import OrderedDict from typing import List, Tuple, Dict, Iterable, Set, TypeVar, Optional -from mypy.build import BuildSource, BuildResult, build +from mypy.nodes import MypyFile +from mypy.build import ( + BuildSource, BuildResult, State, build, sorted_components, get_cache_names, + create_metastore, compute_hash, +) from mypy.errors import CompileError from mypy.options import Options +from mypy.plugin import Plugin, ReportConfigContext from mypyc import genops -from mypyc.common import PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, shared_lib_name +from mypyc.common import ( + BUILD_DIR, PREFIX, TOP_LEVEL_NAME, INT_PREFIX, MODULE_PREFIX, shared_lib_name, +) from mypyc.emit import EmitterContext, Emitter, HeaderDeclaration from mypyc.emitfunc import generate_native_function, native_function_header from mypyc.emitclass import generate_class_type_decl, generate_class @@ -19,7 +29,8 @@ generate_wrapper_function, wrapper_function_header, ) from mypyc.ops import ( - FuncIR, ClassIR, ModuleIR, ModuleIRs, LiteralsMap, RType, RTuple + FuncIR, ClassIR, ModuleIR, ModuleIRs, LiteralsMap, RType, RTuple, + DeserMaps, deserialize_modules, ) from mypyc.options import CompilerOptions from mypyc.uninit import insert_uninit_checks @@ -63,17 +74,266 @@ self.mark = False +class MypycPlugin(Plugin): + """Plugin for making mypyc interoperate properly with mypy incremental mode. + + Basically the point of this plugin is to force mypy to recheck things + based on the demands of mypyc in a couple situations: + * Any modules in the same group must be compiled together, so we + tell mypy that modules depend on all their groupmates. + * If the IR metadata is missing or stale or any of the generated + C source files associated missing or stale, then we need to + recompile the module so we mark it as stale. 
+ """ + + def __init__(self, options: Options, groups: Groups) -> None: + super().__init__(options) + self.group_map = {} # type: Dict[str, Tuple[Optional[str], List[str]]] + for sources, name in groups: + modules = sorted(source.module for source in sources) + for id in modules: + self.group_map[id] = (name, modules) + + self.metastore = create_metastore(options) + + def report_config_data( + self, ctx: ReportConfigContext) -> Optional[Tuple[Optional[str], List[str]]]: + # The config data we report is the group map entry for the module. + # If the data is being used to check validity, we do additional checks + # that the IR cache exists and matches the metadata cache and all + # output source files exist and are up to date. + + id, path, is_check = ctx.id, ctx.path, ctx.is_check + + if id not in self.group_map: + return None + + # If we aren't doing validity checks, just return the cache data + if not is_check: + return self.group_map[id] + + # Load the metadata and IR cache + meta_path, _, _ = get_cache_names(id, path, self.options) + ir_path = get_ir_cache_name(id, path, self.options) + try: + meta_json = self.metastore.read(meta_path) + ir_json = self.metastore.read(ir_path) + except FileNotFoundError: + return None + + ir_data = json.loads(ir_json) + + # Check that the IR cache matches the metadata cache + if compute_hash(meta_json) != ir_data['meta_hash']: + return None + + # Check that all of the source files are present and as expected + for path, hash in ir_data['src_hashes'].items(): + try: + with open(os.path.join(BUILD_DIR, path), 'rb') as f: + contents = f.read() + except FileNotFoundError: + return None + real_hash = hashlib.md5(contents).hexdigest() + if hash != real_hash: + return None + + return self.group_map[id] + + def get_additional_deps(self, file: MypyFile) -> List[Tuple[int, str, int]]: + # Report dependency on modules in the module's group + return [(10, id, -1) for id in self.group_map.get(file.fullname(), (None, []))[1]] + + def 
parse_and_typecheck(sources: List[BuildSource], options: Options, + groups: Groups, alt_lib_path: Optional[str] = None) -> BuildResult: assert options.strict_optional, 'strict_optional must be turned on' result = build(sources=sources, options=options, - alt_lib_path=alt_lib_path) + alt_lib_path=alt_lib_path, + extra_plugins=[MypycPlugin(options, groups)]) if result.errors: raise CompileError(result.errors) return result +def compile_scc_to_ir( + scc: List[MypyFile], + result: BuildResult, + mapper: genops.Mapper, + compiler_options: CompilerOptions, + errors: Errors, +) -> ModuleIRs: + """Compile an SCC into ModuleIRs. + + Any modules that this SCC depends on must have either compiled or + loaded from a cache into mapper. + + Arguments: + scc: The list of MypyFiles to compile + result: The BuildResult from the mypy front-end + mapper: The Mapper object mapping mypy ASTs to class and func IRs + compiler_options: The compilation options + errors: Where to report any errors encountered + + Returns the IR of the modules. + """ + + if compiler_options.verbose: + print("Compiling {}".format(", ".join(x.name() for x in scc))) + + # Generate basic IR, with missing exception and refcount handling. + modules = genops.build_ir( + scc, result.graph, result.types, mapper, compiler_options, errors + ) + if errors.num_errors > 0: + return modules + + # Insert uninit checks. + for module in modules.values(): + for fn in module.functions: + insert_uninit_checks(fn) + # Insert exception handling. + for module in modules.values(): + for fn in module.functions: + insert_exception_handling(fn) + # Insert refcount handling. + for module in modules.values(): + for fn in module.functions: + insert_ref_count_opcodes(fn) + + return modules + + +def compile_modules_to_ir( + result: BuildResult, + mapper: genops.Mapper, + compiler_options: CompilerOptions, + errors: Errors, +) -> ModuleIRs: + """Compile a collection of modules into ModuleIRs. 
+ + The modules to compile are specified as part of mapper's group_map. + + Returns the IR of the modules. + """ + deser_ctx = DeserMaps({}, {}) + modules = {} + + # Process the graph by SCC in topological order, like we do in mypy.build + for scc in sorted_components(result.graph): + scc_states = [result.graph[id] for id in scc] + trees = [st.tree for st in scc_states if st.id in mapper.group_map and st.tree] + + if not trees: + continue + + fresh = all(id not in result.manager.rechecked_modules for id in scc) + if fresh: + load_scc_from_cache(trees, result, mapper, deser_ctx) + else: + scc_ir = compile_scc_to_ir(trees, result, mapper, compiler_options, errors) + modules.update(scc_ir) + + return modules + + +def compile_ir_to_c( + groups: Groups, + modules: ModuleIRs, + result: BuildResult, + mapper: genops.Mapper, + compiler_options: CompilerOptions, +) -> Dict[Optional[str], List[Tuple[str, str]]]: + """Compile a collection of ModuleIRs to C source text. + + Returns a dictionary mapping group names to a list of (file name, + file text) pairs. + """ + source_paths = {source.module: result.graph[source.module].xpath + for sources, _ in groups for source in sources} + + names = NameGenerator([[source.module for source in sources] for sources, _ in groups]) + + # Generate C code for each compilation group. Each group will be + # compiled into a separate extension module. 
+ ctext = {} # type: Dict[Optional[str], List[Tuple[str, str]]] + for group_sources, group_name in groups: + group_modules = [(source.module, modules[source.module]) for source in group_sources + if source.module in modules] + if not group_modules: + ctext[group_name] = [] + continue + literals = mapper.literals[group_name] + generator = GroupGenerator( + literals, group_modules, source_paths, group_name, mapper.group_map, names, + compiler_options.multi_file + ) + ctext[group_name] = generator.generate_c_for_modules() + + return ctext + + +def get_ir_cache_name(id: str, path: str, options: Options) -> str: + meta_path, _, _ = get_cache_names(id, path, options) + return meta_path.replace('.meta.json', '.ir.json') + + +def get_state_ir_cache_name(state: State) -> str: + return get_ir_cache_name(state.id, state.xpath, state.options) + + +def write_cache( + modules: ModuleIRs, + result: BuildResult, + group_map: Dict[str, Optional[str]], + ctext: Dict[Optional[str], List[Tuple[str, str]]], +) -> None: + """Write out the cache information for modules.""" + + hashes = {} + for name, files in ctext.items(): + hashes[name] = {file: compute_hash(data) for file, data in files} + + # Write out cache data + for id, module in modules.items(): + st = result.graph[id] + + meta_path, _, _ = get_cache_names(id, st.xpath, result.manager.options) + + newpath = get_state_ir_cache_name(st) + ir_data = { + 'ir': module.serialize(), + 'meta_hash': compute_hash(result.manager.metastore.read(meta_path)), + 'src_hashes': hashes[group_map[id]], + } + + result.manager.metastore.write(newpath, json.dumps(ir_data)) + + result.manager.metastore.commit() + + +def load_scc_from_cache( + scc: List[MypyFile], + result: BuildResult, + mapper: genops.Mapper, + ctx: DeserMaps, +) -> ModuleIRs: + """Load IR for an SCC of modules from the cache. + + Arguments and return are as compile_scc_to_ir. 
+ """ + cache_data = { + k.fullname(): json.loads( + result.manager.metastore.read(get_state_ir_cache_name(result.graph[k.fullname()])) + )['ir'] for k in scc + } + modules = deserialize_modules(cache_data, ctx) + genops.load_type_map(mapper, scc, ctx) + return modules + + def compile_modules_to_c( result: BuildResult, compiler_options: CompilerOptions, @@ -98,52 +358,17 @@ Returns the IR of the modules and a list containing the generated files for each group. """ - module_names = [source.module for group_sources, _ in groups for source in group_sources] - file_nodes = [result.files[name] for name in module_names] - # Construct a map from modules to what group they belong to - group_map = {} - for group, lib_name in groups: - for source in group: - group_map[source.module] = lib_name - - # Generate basic IR, with missing exception and refcount handling. + group_map = {source.module: lib_name for group, lib_name in groups for source in group} mapper = genops.Mapper(group_map) - modules = genops.build_ir(file_nodes, result.graph, result.types, - mapper, - compiler_options, errors) - if errors.num_errors > 0: - return modules, [] - # Insert uninit checks. - for module in modules.values(): - for fn in module.functions: - insert_uninit_checks(fn) - # Insert exception handling. - for module in modules.values(): - for fn in module.functions: - insert_exception_handling(fn) - # Insert refcount handling. - for module in modules.values(): - for fn in module.functions: - insert_ref_count_opcodes(fn) - source_paths = {module_name: result.files[module_name].path - for module_name in module_names} + modules = compile_modules_to_ir(result, mapper, compiler_options, errors) + ctext = compile_ir_to_c(groups, modules, result, mapper, compiler_options) - names = NameGenerator([[source.module for source in sources] for sources, _ in groups]) + if errors.num_errors == 0: + write_cache(modules, result, group_map, ctext) - # Generate C code for each compilation group. 
Each group will be - # compiled into a separate extension module. - ctext = [] - for group_sources, group_name in groups: - group_modules = [(source.module, modules[source.module]) for source in group_sources] - literals = mapper.literals[group_name] - generator = GroupGenerator( - literals, group_modules, source_paths, group_name, group_map, names, - compiler_options.multi_file - ) - ctext.append(generator.generate_c_for_modules()) - return modules, ctext + return modules, [ctext[name] for _, name in groups] def generate_function_declaration(fn: FuncIR, emitter: Emitter) -> None:
diff --git a/mypyc/genops.py b/mypyc/genops.py index 06976a6..bb72345 100644 --- a/mypyc/genops.py +++ b/mypyc/genops.py
@@ -14,7 +14,7 @@ return r3 """ from typing import ( - TypeVar, Callable, Dict, List, Tuple, Optional, Union, Sequence, Set, Any, cast + TypeVar, Callable, Dict, List, Tuple, Optional, Union, Sequence, Set, Any, Iterable, cast ) from typing_extensions import overload, NoReturn from collections import OrderedDict @@ -68,7 +68,8 @@ NAMESPACE_TYPE, NAMESPACE_MODULE, RaiseStandardError, LoadErrorValue, NO_TRACEBACK_LINE_NO, FuncDecl, FUNC_NORMAL, FUNC_STATICMETHOD, FUNC_CLASSMETHOD, - RUnion, is_optional_type, optional_value_type, all_concrete_classes + RUnion, is_optional_type, optional_value_type, all_concrete_classes, + DeserMaps, ) from mypyc.ops_primitive import binary_ops, unary_ops, func_ops, method_ops, name_ref_ops from mypyc.ops_list import ( @@ -153,16 +154,25 @@ # so that we can easily pick out the right copy of a function that # is conditionally defined. for module in modules: - for name, node in module.names.items(): - # We need to filter out functions that are imported or - # aliases. The best way to do this seems to be by - # checking that the fullname matches. - if (isinstance(node.node, (FuncDef, Decorator, OverloadedFuncDef)) - and node.fullname == module.fullname() + '.' + name): - prepare_func_def(module.fullname(), None, get_func_def(node.node), mapper) + for func in get_module_func_defs(module): + prepare_func_def(module.fullname(), None, func, mapper) # TODO: what else? 
+def load_type_map(mapper: 'Mapper', + modules: List[MypyFile], + deser_ctx: DeserMaps) -> None: + """Populate a Mapper with deserialized IR from a list of modules.""" + for module in modules: + for node in module.defs: + if isinstance(node, ClassDef): + mapper.type_to_ir[node.info] = deser_ctx.classes[node.fullname] + + for module in modules: + for func in get_module_func_defs(module): + mapper.func_to_decl[func] = deser_ctx.functions[func.fullname()].decl + + @strict_optional_dec # Turn on strict optional for any type manipulations we do def build_ir(modules: List[MypyFile], graph: Graph, @@ -176,7 +186,6 @@ result = OrderedDict() # type: ModuleIRs # Generate IR for all modules. - module_names = [mod.fullname() for mod in modules] class_irs = [] for module in modules: @@ -186,7 +195,7 @@ # Second pass. builder = IRBuilder( - module.fullname(), types, graph, errors, mapper, module_names, pbv, options + module.fullname(), types, graph, errors, mapper, pbv, options ) builder.visit_mypy_file(module) module_ir = ModuleIR( @@ -283,6 +292,17 @@ return op +def get_module_func_defs(module: MypyFile) -> Iterable[FuncDef]: + """Collect all of the (non-method) functions declared in a module.""" + for name, node in module.names.items(): + # We need to filter out functions that are imported or + # aliases. The best way to do this seems to be by + # checking that the fullname matches. + if (isinstance(node.node, (FuncDef, Decorator, OverloadedFuncDef)) + and node.fullname == module.fullname() + '.' 
+ name): + yield get_func_def(node.node) + + def specialize_parent_vtable(cls: ClassIR, parent: ClassIR) -> VTableEntries: """Generate the part of a vtable corresponding to a parent class or trait""" updated = [] @@ -1049,7 +1069,6 @@ graph: Graph, errors: Errors, mapper: Mapper, - modules: List[str], pbv: PreBuildVisitor, options: CompilerOptions) -> None: self.current_module = current_module @@ -1062,7 +1081,6 @@ self.functions = [] # type: List[FuncIR] self.classes = [] # type: List[ClassIR] self.final_names = [] # type: List[Tuple[str, RType]] - self.modules = set(modules) self.callable_class_names = set() # type: Set[str] self.options = options @@ -2851,11 +2869,14 @@ value = bytes(expr.value, 'utf8').decode('unicode-escape').encode('raw-unicode-escape') return self.load_static_bytes(value) + def is_native_module(self, module: str) -> bool: + return module in self.mapper.group_map + def is_native_ref_expr(self, expr: RefExpr) -> bool: if expr.node is None: return False if '.' in expr.node.fullname(): - return expr.node.fullname().rpartition('.')[0] in self.modules + return self.is_native_module(expr.node.fullname().rpartition('.')[0]) return True def is_native_module_ref_expr(self, expr: RefExpr) -> bool: @@ -2892,7 +2913,7 @@ if is_final: final_var = sym.node fullname = '{}.{}'.format(sym.node.info.fullname(), final_var.name()) - native = expr.expr.node.module_name in self.modules + native = self.is_native_module(expr.expr.node.module_name) elif self.is_module_member_expr(expr): # a module attribute if isinstance(expr.node, Var) and expr.node.is_final:
diff --git a/mypyc/test-data/run-multimodule.test b/mypyc/test-data/run-multimodule.test index 316ee49..65af40f 100644 --- a/mypyc/test-data/run-multimodule.test +++ b/mypyc/test-data/run-multimodule.test
@@ -554,4 +554,180 @@ 2 [out2] 3 -[rechecked other] +[rechecked other, other_b] + +[case testIncrementalCompilation1] +from other_a import A +from other_b import z + +a = A() +assert a.y == z + +[file other_a.py] +from other_b import z + +class A: + def __init__(self) -> None: + self.y = z +[file other_a.py.2] +from other_b import z + +class A: + def __init__(self) -> None: + self.x = 'test' + self.y = z +[file other_b.py] +import other_a + +z = 10 + +def foo() -> 'other_a.A': + return other_a.A() +[file other_b.py.3] +import other_a + +z = 20 + +def foo() -> 'other_a.A': + return other_a.A() + +[file driver.py] +from native import a +print(a.y, getattr(a, 'x', None)) + +[out] +10 None +[out2] +10 test +[out3] +20 test + +[rechecked other_a, other_b, native] +[rechecked2 other_a, other_b] + + +-- This one tests a group that is not an SCC. +[case testIncrementalCompilation2] +# separate: [(["other_a.py", "other_b.py"], "stuff")] +from other_a import A +from other_b import z + +a = A() +assert a.y == z + +[file other_a.py] +from other_b import z + +class A: + def __init__(self) -> None: + self.y = z +[file other_a.py.2] +from other_b import z + +class A: + def __init__(self) -> None: + self.x = 'test' + self.y = z + +[file other_b.py] +z = 10 + +[file driver.py] +from native import a +print(a.y, getattr(a, 'x', None)) + +[out] +10 None +[out2] +10 test + +[rechecked other_a, other_b, native] + +[case testIncrementalCompilation3] +from other import X +Y = X +def foo() -> int: + return X + +[file other.py] +from typing_extensions import Final +X: Final = 10 + +[file other.py.2] +from typing_extensions import Final +X: Final = 20 + +[file driver.py] +import native +import other +assert native.Y == other.X +assert native.foo() == other.X + +[rechecked native, other] + +-- This one tests a group changing +[case testIncrementalCompilation4] +# separate: [(["other_a.py", "other_b.py"], "stuff")] +# separate2: [] +from other_a import A +from other_b import z + +a = A() 
+assert a.y == z + +[file other_a.py] +from other_b import z + +class A: + def __init__(self) -> None: + self.y = z + +[file other_b.py] +z = 10 + +[file wtvr.py.2] + +[file driver.py] +from native import a +print(a.y, getattr(a, 'x', None)) + +[out] +10 None +[out2] +10 None + +[rechecked other_a, other_b, native] + +-- This one tests cases where other modules *do not* need rechecked +[case testIncrementalCompilation5] +import other_a +[file other_a.py] +from other_b import f +assert f(10) == 20 +[file other_a.py.2] +from other_b import f +assert f(20) == 40 + +[file other_b.py] +def f(x: int) -> int: + return x * 2 + +[file driver.py] +import native + +[rechecked other_a] + +-- Delete one of the C files and make sure this forces recompilation +[case testIncrementalCompilation6] +import other_a +assert other_a.foo() == 10 +[file other_a.py] +def foo() -> int: return 10 + + +[delete build/__native_other_a.c.2] + +[file driver.py] +import native + +[rechecked native, other_a]
diff --git a/mypyc/test/test_run.py b/mypyc/test/test_run.py index 2ed6a89..c88aeac 100644 --- a/mypyc/test/test_run.py +++ b/mypyc/test/test_run.py
@@ -162,7 +162,7 @@ options.python_version = max(sys.version_info[:2], (3, 6)) options.export_types = True options.preserve_asts = True - options.incremental = False + options.incremental = self.separate # Avoid checking modules/packages named 'unchecked', to provide a way # to test interacting with code we don't have types for. @@ -200,6 +200,7 @@ result = emitmodule.parse_and_typecheck( sources=sources, options=options, + groups=groups, alt_lib_path='.') errors = Errors() compiler_options = CompilerOptions(multi_file=self.multi_file, separate=self.separate)