Merge pull request #481 from rjmansfield/macho-archs-support

Add -d archs data source for Mach-O universal binaries
diff --git a/src/bloaty.cc b/src/bloaty.cc
index e772988..5232959 100644
--- a/src/bloaty.cc
+++ b/src/bloaty.cc
@@ -85,6 +85,7 @@
 
 constexpr DataSourceDefinition data_sources[] = {
     {DataSource::kArchiveMembers, "armembers", "the .o files in a .a file"},
+    {DataSource::kArchs, "archs", "architecture slices in universal binaries"},
     {DataSource::kCompileUnits, "compileunits",
      "source file for the .o file (translation unit). requires debug info."},
     {DataSource::kInputFiles, "inputfiles",
diff --git a/src/bloaty.h b/src/bloaty.h
index 85515fc..c9e2d1f 100644
--- a/src/bloaty.h
+++ b/src/bloaty.h
@@ -58,6 +58,7 @@
   kRawRanges,
   kSections,
   kSegments,
+  kArchs,
 
   // We always set this to one of the concrete symbol types below before
   // setting it on a sink.
diff --git a/src/elf.cc b/src/elf.cc
index ac0f610..de2df46 100644
--- a/src/elf.cc
+++ b/src/elf.cc
@@ -1401,6 +1401,8 @@
           DoReadELFSections(sink, kReportByEscapedSectionName);
           break;
         }
+        case DataSource::kArchs:
+          THROW("ELF files do not support 'archs' data source");
         default:
           THROW("unknown data source");
       }
diff --git a/src/macho.cc b/src/macho.cc
index 3f849ac..5b8ca82 100644
--- a/src/macho.cc
+++ b/src/macho.cc
@@ -21,7 +21,9 @@
 #include <string_view>
 
 #include "absl/strings/str_join.h"
+#include "absl/strings/str_format.h"
 #include "absl/strings/substitute.h"
+#include "third_party/darwin_xnu_macho/mach/machine.h"
 #include "third_party/darwin_xnu_macho/mach-o/loader.h"
 #include "third_party/darwin_xnu_macho/mach-o/fat.h"
 #include "third_party/darwin_xnu_macho/mach-o/nlist.h"
@@ -69,6 +71,57 @@
   }
 }
 
+// Mask for splitting a Mach-O cpusubtype; used below to recognize arm64e.
+static constexpr uint32_t ARM64E_SUBTYPE_MASK = 0x00FFFFFF;  // Low 24 bits: subtype proper
+
+// True when the low 24 bits of `cpusubtype` equal CPU_SUBTYPE_ARM64E.
+static bool IsArm64eSubtype(uint32_t cpusubtype) {
+  return (cpusubtype & ARM64E_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E;
+}
+
+// Maps a Mach-O (cputype, cpusubtype) pair to a display name such as
+// "x86_64h", "arm64e" or "armv7s". A subtype that is not listed for its CPU
+// type yields the bare family name ("x86_64", "arm64", "arm"); a CPU type that
+// is not listed yields "cpu_<cputype>".
+std::string CpuTypeToString(uint32_t cputype, uint32_t cpusubtype) {
+  // Compare only the low 24 bits, exactly as IsArm64eSubtype() does, so that
+  // bits set above the subtype proper cannot hide x86_64h, arm64v8 or armv*.
+  const uint32_t subtype = cpusubtype & ARM64E_SUBTYPE_MASK;
+  switch (cputype) {
+    case CPU_TYPE_X86_64:
+      // x86_64h is the only x86_64 subtype that gets its own name here.
+      return subtype == CPU_SUBTYPE_X86_64_H ? "x86_64h" : "x86_64";
+    case CPU_TYPE_ARM64:
+      // arm64e is checked first, through the shared helper.
+      if (IsArm64eSubtype(cpusubtype)) {
+        return "arm64e";
+      }
+      return subtype == CPU_SUBTYPE_ARM64_V8 ? "arm64v8" : "arm64";
+    case CPU_TYPE_X86:
+      return "i386";
+    case CPU_TYPE_ARM:
+      switch (subtype) {
+        case CPU_SUBTYPE_ARM_V6:
+          return "armv6";
+        case CPU_SUBTYPE_ARM_V7:
+          return "armv7";
+        case CPU_SUBTYPE_ARM_V7F:
+          return "armv7f";
+        case CPU_SUBTYPE_ARM_V7S:
+          return "armv7s";
+        case CPU_SUBTYPE_ARM_V7K:
+          return "armv7k";
+        case CPU_SUBTYPE_ARM_V8:
+          return "armv8";
+        default:
+          return "arm";
+      }
+    default:
+      // Unlisted CPU family: fall back to the numeric cputype.
+      return absl::StrFormat("cpu_%d", cputype);
+  }
+}
+
 struct LoadCommand {
   bool is64bit;
   uint32_t cmd;
@@ -652,6 +705,10 @@
           ReadDWARFInlines(dwarf, sink, true);
           break;
         }
+        case DataSource::kArchs: {
+          ProcessArchitectures(sink);
+          break;
+        }
         case DataSource::kArchiveMembers:
         default:
           THROW("Mach-O doesn't support this data source");
@@ -660,6 +717,34 @@
     }
   }
 
+  // Emits one "archs" range per architecture slice. A FAT_CIGAM file yields a
+  // range for each fat_arch entry; any other file is reported as one slice.
+  void ProcessArchitectures(RangeSink* sink) const {
+    const string_view whole_file = file_data().data();
+
+    if (ReadMagic(whole_file) != FAT_CIGAM) {
+      // No fat header: name the whole file after its own Mach-O header.
+      auto thin = GetStructPointer<mach_header>(whole_file);
+      std::string label = CpuTypeToString(thin->cputype, thin->cpusubtype);
+      sink->AddFileRange("archs", label, whole_file);
+      return;
+    }
+
+    // Fat header fields are passed through ByteSwap() before being used.
+    string_view cursor = whole_file;
+    auto fat = GetStructPointerAndAdvance<fat_header>(&cursor);
+    const uint32_t slice_count = ByteSwap(fat->nfat_arch);
+    for (uint32_t idx = 0; idx < slice_count; ++idx) {
+      auto entry = GetStructPointerAndAdvance<fat_arch>(&cursor);
+      const uint32_t slice_offset = ByteSwap(entry->offset);
+      const uint32_t slice_size = ByteSwap(entry->size);
+      const std::string label = CpuTypeToString(ByteSwap(entry->cputype),
+                                                ByteSwap(entry->cpusubtype));
+      sink->AddFileRange("archs", label,
+                         StrictSubstr(whole_file, slice_offset, slice_size));
+    }
+  }
+
   bool GetDisassemblyInfo(std::string_view /*symbol*/,
                           DataSource /*symbol_source*/,
                           DisassemblyInfo* /*info*/) const override {
diff --git a/tests/macho/archs.test b/tests/macho/archs.test
new file mode 100644
index 0000000..a8374b4
--- /dev/null
+++ b/tests/macho/archs.test
@@ -0,0 +1,263 @@
+# Test -d archs data source for Mach-O universal binaries
+#
+# Tests that the 'archs' data source correctly reports architecture slices
+# in universal binaries and single-architecture binaries.
+
+## Test 1: Universal binary with two architectures (x86_64 and arm64)
+# RUN: %yaml2obj --docnum=1 %s -o %t.universal
+# RUN: %bloaty %t.universal -d archs --domain=file | %FileCheck --check-prefix=UNIVERSAL %s
+
+# UNIVERSAL: FILE SIZE
+# UNIVERSAL-DAG: x86_64
+# UNIVERSAL-DAG: arm64
+# UNIVERSAL-DAG: [Unmapped]
+
+## Test 2: Filter to x86_64 architecture only
+# RUN: %bloaty %t.universal -d archs,segments --source-filter=x86_64 --domain=file | %FileCheck --check-prefix=FILTER-X86 %s
+
+# FILTER-X86: FILE SIZE
+# FILTER-X86: x86_64
+# FILTER-X86: __TEXT
+# FILTER-X86: __LINKEDIT
+# FILTER-X86-NOT: arm64
+
+## Test 3: Filter to arm64 architecture only
+# RUN: %bloaty %t.universal -d archs,segments --source-filter=arm64 --domain=file | %FileCheck --check-prefix=FILTER-ARM %s
+
+# FILTER-ARM: FILE SIZE
+# FILTER-ARM: arm64
+# FILTER-ARM: __TEXT
+# FILTER-ARM: __LINKEDIT
+# FILTER-ARM-NOT: x86_64
+
+## Test 4: Single architecture binary
+# RUN: %yaml2obj --docnum=2 %s -o %t.single
+# RUN: %bloaty %t.single -d archs --domain=file | %FileCheck --check-prefix=SINGLE %s
+
+# SINGLE: FILE SIZE
+# SINGLE: x86_64
+# SINGLE-NOT: arm64
+
+## Universal binary with x86_64 and arm64 slices
+--- !fat-mach-o
+FatHeader:
+  magic:           0xCAFEBABE
+  nfat_arch:       2
+FatArchs:
+  - cputype:         0x1000007
+    cpusubtype:      0x3
+    offset:          0x1000
+    size:            4176
+    align:           12
+  - cputype:         0x100000C
+    cpusubtype:      0x0
+    offset:          0x2050
+    size:            8280
+    align:           12
+Slices:
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x1000007
+      cpusubtype:      0x3
+      filetype:        0x2
+      ncmds:           3
+      sizeofcmds:      328
+      flags:           0x200085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __PAGEZERO
+        vmaddr:          0
+        vmsize:          4294967296
+        fileoff:         0
+        filesize:        0
+        maxprot:         0
+        initprot:        0
+        nsects:          0
+        flags:           0
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __TEXT
+        vmaddr:          4294967296
+        vmsize:          4096
+        fileoff:         0
+        filesize:        4096
+        maxprot:         5
+        initprot:        5
+        nsects:          1
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x100000F80
+            size:            8
+            offset:          0xF80
+            align:           4
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         554889E531C05DC3
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          4294971392
+        vmsize:          4096
+        fileoff:         4096
+        filesize:        80
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+    LinkEditData:
+      NameList:
+        - n_strx:          1
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         4294971264
+      StringTable:
+        - ' '
+        - _main
+  - !mach-o
+    FileHeader:
+      magic:           0xFEEDFACF
+      cputype:         0x100000C
+      cpusubtype:      0x0
+      filetype:        0x2
+      ncmds:           3
+      sizeofcmds:      328
+      flags:           0x200085
+      reserved:        0x0
+    LoadCommands:
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __PAGEZERO
+        vmaddr:          0
+        vmsize:          4294967296
+        fileoff:         0
+        filesize:        0
+        maxprot:         0
+        initprot:        0
+        nsects:          0
+        flags:           0
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         152
+        segname:         __TEXT
+        vmaddr:          4294967296
+        vmsize:          8192
+        fileoff:         0
+        filesize:        8192
+        maxprot:         5
+        initprot:        5
+        nsects:          1
+        flags:           0
+        Sections:
+          - sectname:        __text
+            segname:         __TEXT
+            addr:            0x100001F80
+            size:            8
+            offset:          0x1F80
+            align:           2
+            reloff:          0x0
+            nreloc:          0
+            flags:           0x80000400
+            reserved1:       0x0
+            reserved2:       0x0
+            reserved3:       0x0
+            content:         00008052C0035FD6
+      - cmd:             LC_SEGMENT_64
+        cmdsize:         72
+        segname:         __LINKEDIT
+        vmaddr:          4294975488
+        vmsize:          4096
+        fileoff:         8192
+        filesize:        88
+        maxprot:         1
+        initprot:        1
+        nsects:          0
+        flags:           0
+    LinkEditData:
+      NameList:
+        - n_strx:          1
+          n_type:          0xF
+          n_sect:          1
+          n_desc:          0
+          n_value:         4294975360
+      StringTable:
+        - ' '
+        - _main
+
+## Single x86_64 Mach-O executable
+--- !mach-o
+FileHeader:
+  magic:           0xFEEDFACF
+  cputype:         0x1000007
+  cpusubtype:      0x3
+  filetype:        0x2
+  ncmds:           3
+  sizeofcmds:      328
+  flags:           0x200085
+  reserved:        0x0
+LoadCommands:
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         152
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        4096
+    maxprot:         5
+    initprot:        5
+    nsects:          1
+    flags:           0
+    Sections:
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x100000F80
+        size:            8
+        offset:          0xF80
+        align:           4
+        reloff:          0x0
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x0
+        reserved2:       0x0
+        reserved3:       0x0
+        content:         554889E531C05DC3
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        80
+    maxprot:         1
+    initprot:        1
+    nsects:          0
+    flags:           0
+LinkEditData:
+  NameList:
+    - n_strx:          1
+      n_type:          0xF
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971264
+  StringTable:
+    - ' '
+    - _main