[mypyc] Enable incremental self-compilation (#21369)

Six fixes on top of #21299, all required to self-compile mypy or to
install a `separate=True` wheel via pip.

- `mypyc/build.py`: pip invokes `setup.py` twice when building a wheel.
On the second invocation mypy's incremental cache is fully warm, so we
generate no new C source for any group; the resulting extensions ship
without their entry points and import as stubs.
  - **Fix**: when a group emits no C source, reuse the .c file from the
    previous pass.

- `mypyc/codegen/{emit,emitfunc}.py`: when code in one compiled group
reads an attribute on an object whose class lives in another group, the
generated cast depends on that other group's struct definitions. We
weren't recording the dependency, so the C compiler couldn't see the
layout and the build failed.
  - **Fix**: register the dependency at the cast site.

- `mypyc/codegen/emitmodule.py`: when mypy compiles itself, a generated
shim file can share a basename with a runtime C file. The C compiler
resolves the runtime include relative to the shim's directory and picks
up the shim instead.
  - **Fix**: search the include path explicitly so shims can't shadow
    runtime files.

- `mypyc/lib-rt/misc_ops.c`: each compiled module gets its own shared
library next to it in the package tree. The runtime was computing the
module's file path as if a single shared library sat above the whole
package, which doubled the package prefix and broke submodule lookups.
  - **Fix**: detect the per-module case and use only the module's leaf
    name.

- `mypyc/irbuild/prepare.py`: traits and builtin-derived classes don't
get a real C constructor emitted. A clean build sidesteps that, but a
fully cached rebuild was taking the direct-call path and producing C
that referenced a constructor that doesn't exist.
  - **Fix**: skip the registration the same way a clean build does.

- `mypyc/build.py`: on every build_ext, setuptools rewrites every
compiled .so in the source tree even when nothing changed. On macOS this
invalidates the OS signature cache, so every import on the next run pays
a re-verification cost.
  - **Fix**: skip the copy when source and destination already match,
    taking a 1-line edit rebuild from ~72s to ~6s. This is really a
    `setuptools` limitation, though (possibly relevant: [mypyc
    issue #1068](https://github.com/mypyc/mypyc/issues/1068)).

I also added a `MYPYC_SEPARATE` env knob (set it to `1`) so CI can
exercise the `separate=True` codegen path against mypy itself.


## Benchmarks

Mypy self-compile on macOS, `MYPYC_OPT_LEVEL=0`, `-j 11`. Three
scenarios:

| | monolithic | separate=True |
|:---:|:---:|:---:|
| Clean build | 180s | 108s |
| No-op rebuild | 124s | 5s |
| 1-line edit | 106s | 6s |
diff --git a/mypyc/build.py b/mypyc/build.py
index 08eeb13..8463308 100644
--- a/mypyc/build.py
+++ b/mypyc/build.py
@@ -450,6 +450,70 @@
         os.utime(path, times=(new_mtime, new_mtime))
 
 
+_MYPYC_EXTENSION_MARKER = "_mypyc_skip_redundant_inplace_copy"
+_setuptools_patch_applied = False
+
+
+def _patch_setuptools_copy_extensions_to_source() -> None:
+    """Skip redundant `.so` copies for extensions we generated.
+
+    setuptools' copy_extensions_to_source rewrites every `.so` in the
+    source tree on every build_ext, even when nothing changed. On macOS
+    this invalidates AMFI's signature cache (~100 ms re-verification per
+    `.so` on the next import), eating most of the separate=True
+    incremental speedup.
+
+    The patch is global because copy_extensions_to_source runs during
+    setup()'s build_ext command, after mypycify() has already returned;
+    we can't scope a context manager around it. Instead the skip only
+    fires for extensions tagged by mypycify (via the marker attribute),
+    so other setuptools users in the same setup.py see the unmodified
+    upstream behavior, including stub writes.
+    """
+    global _setuptools_patch_applied
+    if _setuptools_patch_applied:
+        return
+    _setuptools_patch_applied = True
+
+    from setuptools.command.build_ext import build_ext as _build_ext
+
+    original = _build_ext.copy_extensions_to_source
+
+    def _files_match(a: str, b: str) -> bool:
+        try:
+            sa = os.stat(a)
+            sb = os.stat(b)
+        except OSError:
+            return False
+        # Compare size + whole-second mtime. distutils' copy_file
+        # propagates the source mtime, but macOS drops sub-second
+        # precision on write so the float values never match verbatim.
+        return sa.st_size == sb.st_size and int(sa.st_mtime) == int(sb.st_mtime)
+
+    def patched(self: Any) -> None:
+        build_py = self.get_finalized_command("build_py")
+
+        def is_redundant(ext: Any) -> bool:
+            if not getattr(ext, _MYPYC_EXTENSION_MARKER, False):
+                return False
+            inplace_file, regular_file = self._get_inplace_equivalent(build_py, ext)
+            return _files_match(regular_file, inplace_file)
+
+        # Hide our already-fresh extensions from setuptools' loop and
+        # let it handle whatever's left. Delegating instead of
+        # reimplementing the body means future setuptools changes carry
+        # over for free. self.extensions is restored before we return
+        # so anything that inspects it later sees the original list.
+        saved = self.extensions
+        self.extensions = [ext for ext in saved if not is_redundant(ext)]
+        try:
+            original(self)
+        finally:
+            self.extensions = saved
+
+    _build_ext.copy_extensions_to_source = patched  # type: ignore[method-assign]
+
+
 def construct_groups(
     sources: list[BuildSource],
     separate: bool | list[tuple[list[str], str | None]],
@@ -513,7 +577,7 @@
     """
     headers: set[str] = set()
     for _, contents in cfiles:
-        headers.update(re.findall(r'#include "(.*)"', contents))
+        headers.update(re.findall(r'#include [<"]([^>"]+)[>"]', contents))
 
     return sorted(headers)
 
@@ -573,12 +637,21 @@
         cfilenames = []
         for cfile, ctext in cfiles:
             cfile = os.path.join(compiler_options.target_dir, cfile)
-            if not options.mypyc_skip_c_generation:
+            # Empty contents marks a file the previous run already wrote
+            # (fully-cached group): skip the rewrite and just reuse it.
+            if ctext and not options.mypyc_skip_c_generation:
                 write_file(cfile, ctext)
             if os.path.splitext(cfile)[1] == ".c":
                 cfilenames.append(cfile)
 
-        deps = [os.path.join(compiler_options.target_dir, dep) for dep in get_header_deps(cfiles)]
+        # The header regex matches both quote styles, so the result can
+        # include system headers like `<Python.h>` that don't live under
+        # target_dir. Joining those produces non-existent paths which
+        # would force a full rebuild on every run via Extension.depends.
+        candidate_deps = (
+            os.path.join(compiler_options.target_dir, dep) for dep in get_header_deps(cfiles)
+        )
+        deps = [d for d in candidate_deps if os.path.exists(d)]
         group_cfilenames.append((cfilenames, deps))
 
     return groups, group_cfilenames, source_deps
@@ -755,6 +828,9 @@
                                have no backward compatibility guarantees!
     """
 
+    # Skip redundant inplace .so copies on every build_ext invocation.
+    _patch_setuptools_copy_extensions_to_source()
+
     # Figure out our configuration
     compiler_options = CompilerOptions(
         strip_asserts=strip_asserts,
@@ -869,4 +945,9 @@
                 )
             )
 
+    # Tag every extension we own so the build_ext patch knows it's
+    # safe to skip the redundant inplace copy for these specifically.
+    for ext in extensions:
+        setattr(ext, _MYPYC_EXTENSION_MARKER, True)
+
     return extensions
diff --git a/mypyc/codegen/emit.py b/mypyc/codegen/emit.py
index 45ff34a..01cf359 100644
--- a/mypyc/codegen/emit.py
+++ b/mypyc/codegen/emit.py
@@ -326,6 +326,18 @@
         # See docs above
         return self.get_module_group_prefix(obj.module_name)
 
+    def register_group_dep(self, cl: ClassIR) -> None:
+        """Record `cl`'s defining group as a cross-group dep, if any.
+
+        Call this when emitting code that refers to `cl`'s struct
+        layout: the .c file consuming that layout needs the defining
+        group's `__native_*.h` included, and group_deps drives which
+        headers get pulled in.
+        """
+        target_group = self.context.group_map.get(cl.module_name)
+        if target_group and target_group != self.context.group_name:
+            self.context.group_deps.add(target_group)
+
     def static_name(self, id: str, module: str | None, prefix: str = STATIC_PREFIX) -> str:
         """Create name of a C static variable.
 
diff --git a/mypyc/codegen/emitfunc.py b/mypyc/codegen/emitfunc.py
index e4a8922..dcb606f 100644
--- a/mypyc/codegen/emitfunc.py
+++ b/mypyc/codegen/emitfunc.py
@@ -360,6 +360,11 @@
         classes, and *(obj + attr_offset) for attributes defined by traits. We also
         insert all necessary C casts here.
         """
+        # The struct cast below needs the defining group's __native.h
+        # included by the consuming .c file. Record both the receiver
+        # and declaring classes as cross-group deps.
+        self.emitter.register_group_dep(op.class_type.class_ir)
+        self.emitter.register_group_dep(decl_cl)
         cast = f"({op.class_type.struct_name(self.emitter.names)} *)"
         if decl_cl.is_trait and op.class_type.class_ir.is_trait:
             # For pure trait access find the offset first, offsets
diff --git a/mypyc/codegen/emitmodule.py b/mypyc/codegen/emitmodule.py
index 3f10df7..fa0a438 100644
--- a/mypyc/codegen/emitmodule.py
+++ b/mypyc/codegen/emitmodule.py
@@ -363,7 +363,12 @@
             if source.module in modules
         }
         if not group_modules:
-            ctext[group_name] = []
+            # Fully-cached group (e.g. pip's second setup.py invoke for
+            # the wheel phase): no fresh IR was produced. Reuse the file
+            # list recorded in any module's IR cache so the linker still
+            # sees the previous run's outputs; empty content is a "do
+            # not rewrite" sentinel for mypyc_build.
+            ctext[group_name] = _load_cached_group_files(group_sources, result)
             continue
         generator = GroupGenerator(
             group_modules, source_paths, group_name, mapper.group_map, names, compiler_options
@@ -373,6 +378,32 @@
     return ctext
 
 
+def _load_cached_group_files(
+    group_sources: list[BuildSource], result: BuildResult
+) -> list[tuple[str, str]]:
+    """Read the .c/.h paths recorded for this group on the previous run.
+
+    All modules in a group share the same src_hashes map, so the first
+    readable IR cache is sufficient. Returns paths paired with empty
+    content so callers can distinguish "reuse on disk" from "newly
+    generated".
+    """
+    for source in group_sources:
+        state = result.graph.get(source.module)
+        if state is None:
+            continue
+        try:
+            ir_json = result.manager.metastore.read(get_state_ir_cache_name(state))
+        except (FileNotFoundError, OSError):
+            continue
+        try:
+            ir_data = json.loads(ir_json)
+        except json.JSONDecodeError:
+            continue
+        return [(path, "") for path in ir_data.get("src_hashes", {})]
+    return []
+
+
 def get_ir_cache_name(id: str, path: str, options: Options) -> str:
     meta_path, _, _ = get_cache_names(id, path, options)
     # Mypyc uses JSON cache even with --fixed-format-cache (for now).
@@ -615,16 +646,19 @@
 
         base_emitter = Emitter(self.context)
         # Optionally just include the runtime library c files to
-        # reduce the number of compiler invocations needed
+        # reduce the number of compiler invocations needed.
+        # Use <> form (only -I paths) so a shim file with the same
+        # basename as a runtime file can't shadow it. Triggered by
+        # mypyc/lower/int_ops.py vs lib-rt/int_ops.c on mypy self-compile.
         if self.compiler_options.include_runtime_files:
             for name in RUNTIME_C_FILES:
-                base_emitter.emit_line(f'#include "{name}"')
+                base_emitter.emit_line(f"#include <{name}>")
             # Include conditional source files
             source_deps = collect_source_dependencies(self.modules)
             for source_dep in sorted(source_deps, key=lambda d: d.path):
-                base_emitter.emit_line(f'#include "{source_dep.path}"')
+                base_emitter.emit_line(f"#include <{source_dep.path}>")
             if self.compiler_options.depends_on_librt_internal:
-                base_emitter.emit_line('#include "internal/librt_internal_api.c"')
+                base_emitter.emit_line("#include <internal/librt_internal_api.c>")
         base_emitter.emit_line(f'#include "__native{self.short_group_suffix}.h"')
         base_emitter.emit_line(f'#include "__native_internal{self.short_group_suffix}.h"')
         emitter = base_emitter
diff --git a/mypyc/irbuild/prepare.py b/mypyc/irbuild/prepare.py
index 09bfc83..f143ce1 100644
--- a/mypyc/irbuild/prepare.py
+++ b/mypyc/irbuild/prepare.py
@@ -182,7 +182,12 @@
                     continue
                 mapper.type_to_ir[node.node] = ir
                 mapper.symbol_fullnames.add(node.node.fullname)
-                mapper.func_to_decl[node.node] = ir.ctor
+                # Trait/builtin-base classes have an ir.ctor FuncDecl
+                # but no emitted CPyDef_<ctor>, so a cross-group direct
+                # call would hit an undefined symbol. Mirror the skip
+                # in prepare_init_method.
+                if not ir.is_trait and not ir.builtin_base:
+                    mapper.func_to_decl[node.node] = ir.ctor
 
     for module in modules:
         for func in get_module_func_defs(module):
diff --git a/mypyc/lib-rt/misc_ops.c b/mypyc/lib-rt/misc_ops.c
index 2aaadb2..392dba0 100644
--- a/mypyc/lib-rt/misc_ops.c
+++ b/mypyc/lib-rt/misc_ops.c
@@ -1281,12 +1281,17 @@
         Py_DECREF(file);
         return 0;
     }
-    // Derive __file__ from the shared library's __file__ (for its
-    // directory), the module name (dots -> path separators), and the
-    // extension suffix.  E.g. for module "a.b.c", shared lib
-    // "/path/to/group__mypyc.cpython-312-x86_64-linux-gnu.so",
-    // suffix ".cpython-312-x86_64-linux-gnu.so":
-    //   => "/path/to/a/b/c.cpython-312-x86_64-linux-gnu.so"
+    // Derive __file__ from the shared lib's directory, the module
+    // name, and the extension suffix. Two layouts:
+    //
+    //  Monolithic: one shared lib above the package tree holds many
+    //    modules, so append the full dotted module path.
+    //  separate=True: each module has its own "<segment>__mypyc.so"
+    //    next to the module, so dirname(shared_lib) is already inside
+    //    the parent package. Append only the last segment.
+    //
+    // Detect the separate=True case by matching the shared lib's
+    // basename against "<last_segment>__mypyc<ext>".
     PyObject *derived_file = NULL;
     if (shared_lib_file != NULL && shared_lib_file != Py_None &&
             PyUnicode_Check(shared_lib_file)) {
@@ -1314,30 +1319,65 @@
         if (module_path == NULL) {
             return -1;
         }
+
+        // Compute the module's last dotted segment for the separate=True check.
+        Py_ssize_t name_len = PyUnicode_GetLength(module_name);
+        Py_ssize_t last_dot = PyUnicode_FindChar(module_name, '.', 0, name_len, -1);
+        PyObject *last_segment;
+        if (last_dot >= 0) {
+            last_segment = PyUnicode_Substring(module_name, last_dot + 1, name_len);
+        } else {
+            last_segment = module_name;
+            Py_INCREF(last_segment);
+        }
+        if (last_segment == NULL) {
+            Py_DECREF(module_path);
+            return -1;
+        }
+        // Compare shared_lib_file basename against "<last_segment>__mypyc<ext>".
+        PyObject *expected_basename = PyUnicode_FromFormat(
+            "%U__mypyc%U", last_segment, ext_suffix);
+        PyObject *actual_basename;
+        if (sep >= 0) {
+            actual_basename = PyUnicode_Substring(shared_lib_file, sep + 1, sf_len);
+        } else {
+            actual_basename = shared_lib_file;
+            Py_INCREF(actual_basename);
+        }
+        int is_per_module_lib = 0;
+        if (expected_basename != NULL && actual_basename != NULL) {
+            is_per_module_lib =
+                (PyUnicode_Compare(expected_basename, actual_basename) == 0);
+        }
+        Py_XDECREF(expected_basename);
+        Py_XDECREF(actual_basename);
+
         // For packages, __file__ should point to __init__<ext>,
         // e.g. "a/b/__init__.cpython-312-x86_64-linux-gnu.so".
+        PyObject *file_path = is_per_module_lib ? last_segment : module_path;
         if (sep >= 0) {
             PyObject *dir = PyUnicode_Substring(shared_lib_file, 0, sep);
             if (dir != NULL) {
                 if (is_package) {
                     derived_file = PyUnicode_FromFormat(
                         "%U%c%U%c__init__%U", dir, (int)sep_char,
-                        module_path, (int)sep_char, ext_suffix);
+                        file_path, (int)sep_char, ext_suffix);
                 } else {
                     derived_file = PyUnicode_FromFormat(
                         "%U%c%U%U", dir, (int)sep_char,
-                        module_path, ext_suffix);
+                        file_path, ext_suffix);
                 }
                 Py_DECREF(dir);
             }
         } else {
             if (is_package) {
                 derived_file = PyUnicode_FromFormat(
-                    "%U%c__init__%U", module_path, (int)SEP[0], ext_suffix);
+                    "%U%c__init__%U", file_path, (int)SEP[0], ext_suffix);
             } else {
-                derived_file = PyUnicode_FromFormat("%U%U", module_path, ext_suffix);
+                derived_file = PyUnicode_FromFormat("%U%U", file_path, ext_suffix);
             }
         }
+        Py_DECREF(last_segment);
         Py_DECREF(module_path);
     }
     if (derived_file == NULL && !PyErr_Occurred()) {
diff --git a/setup.py b/setup.py
index d36a6bf..1879f68 100644
--- a/setup.py
+++ b/setup.py
@@ -153,6 +153,7 @@
     debug_level = os.getenv("MYPYC_DEBUG_LEVEL", "1")
     force_multifile = os.getenv("MYPYC_MULTI_FILE", "") == "1"
     log_trace = bool(int(os.getenv("MYPYC_LOG_TRACE", "0")))
+    separate = os.getenv("MYPYC_SEPARATE", "") == "1"
     ext_modules = mypycify(
         mypyc_targets + ["--config-file=mypy_bootstrap.ini"],
         opt_level=opt_level,
@@ -161,6 +162,7 @@
         # our Appveyor builds run out of memory sometimes.
         multi_file=sys.platform == "win32" or force_multifile,
         log_trace=log_trace,
+        separate=separate,
         # Mypy itself is allowed to use native_internal extension.
         depends_on_librt_internal=True,
     )