Merge pull request #136 from haberman/macho

Enabled "-d compileunits" for Mach-O.
diff --git a/README.md b/README.md
index 63951a9..6d48013 100644
--- a/README.md
+++ b/README.md
@@ -329,10 +329,14 @@
 so no special configuration is needed to make sure you get
 them.
 
-TODO: Mach-O puts debug info in separate files, which are
-created using `dsymutil`.  DWARF is not yet supported for
-Mach-O, but once it is then `--debug-file` will be necessary
-to help Bloaty find these separate debug files also.
+Mach-O puts debug information in separate files, which you
+can create with `dsymutil`:
+
+```
+$ dsymutil bloaty
+$ strip bloaty  (optional)
+$ ./bloaty -d symbols --debug-file=bloaty.dSYM/Contents/Resources/DWARF/bloaty bloaty
+```
 
 # Configuration Files
 
diff --git a/src/bloaty.cc b/src/bloaty.cc
index 3ec0a88..0f09756 100644
--- a/src/bloaty.cc
+++ b/src/bloaty.cc
@@ -98,6 +98,7 @@
      "raw ranges of previous data source."},
     {DataSource::kSections, "sections", "object file section"},
     {DataSource::kSegments, "segments", "load commands in the binary"},
+    // We require that all symbol sources are >= kSymbols.
     {DataSource::kSymbols, "symbols",
      "symbols from symbol table (configure demangling with --demangle)"},
     {DataSource::kRawSymbols, "rawsymbols", "unmangled symbols"},
@@ -817,6 +818,8 @@
 
 // RangeMap ////////////////////////////////////////////////////////////////////
 
+constexpr uint64_t RangeSink::kUnknownSize;
+
 
 // MmapInputFile ///////////////////////////////////////////////////////////////
 
diff --git a/src/bloaty.h b/src/bloaty.h
index d37e9a0..8e1cbce 100644
--- a/src/bloaty.h
+++ b/src/bloaty.h
@@ -220,7 +220,7 @@
   uint64_t TranslateFileToVM(const char* ptr);
   absl::string_view TranslateVMToFile(uint64_t address);
 
-  static const uint64_t kUnknownSize = RangeMap::kUnknownSize;
+  static constexpr uint64_t kUnknownSize = RangeMap::kUnknownSize;
 
  private:
   BLOATY_DISALLOW_COPY_AND_ASSIGN(RangeSink);
diff --git a/src/dwarf.cc b/src/dwarf.cc
index f795e67..20fad8d 100644
--- a/src/dwarf.cc
+++ b/src/dwarf.cc
@@ -183,6 +183,9 @@
   // The size of addresses.  Guaranteed to be either 4 or 8.
   uint8_t address_size() const { return address_size_; }
 
+  // DWARF version of this unit.
+  uint8_t dwarf_version() const { return dwarf_version_; }
+
   void SetAddressSize(uint8_t address_size) {
     if (address_size != 4 && address_size != 8) {
       THROWF("Unexpected address size: $0", address_size);
@@ -243,7 +246,12 @@
     return unit;
   }
 
+  void ReadDWARFVersion(string_view* data) {
+    dwarf_version_ = ReadMemcpy<uint16_t>(data);
+  }
+
  private:
+  uint16_t dwarf_version_;
   bool dwarf64_;
   uint8_t address_size_;
 };
@@ -422,9 +430,9 @@
   }
 
   unit_remaining_ = sizes_.ReadInitialLength(&next_unit_);
-  uint16_t version = ReadMemcpy<uint16_t>(&unit_remaining_);
+  sizes_.ReadDWARFVersion(&unit_remaining_);
 
-  if (version > 2) {
+  if (sizes_.dwarf_version() > 2) {
     THROW("DWARF data is too new for us");
   }
 
@@ -686,11 +694,13 @@
 
 bool DIEReader::ReadCode() {
   uint32_t code;
+  size_t offset;
 again:
   if (remaining_.empty()) {
     state_ = State::kEof;
     return false;
   }
+  offset = remaining_.data() - unit_range_.data();
   code = ReadLEB128<uint32_t>(&remaining_);
   if (code == 0) {
     // null entry terminates a chain of sibling entries.
@@ -748,9 +758,9 @@
   unit_range_ = unit_range_.substr(
       0, remaining_.size() + (remaining_.data() - unit_range_.data()));
 
-  uint16_t version = ReadMemcpy<uint16_t>(&remaining_);
+  unit_sizes_.ReadDWARFVersion(&remaining_);
 
-  if (version > 4) {
+  if (unit_sizes_.dwarf_version() > 4) {
     THROW("Data is in new DWARF format we don't understand");
   }
 
@@ -874,7 +884,7 @@
     case DW_FORM_ref8:
       return AttrValue(ReadMemcpy<uint64_t>(data));
     case DW_FORM_addr:
-    case DW_FORM_ref_addr:
+    address_size:
       switch (reader.unit_sizes().address_size()) {
         case 4:
           return AttrValue(ReadMemcpy<uint32_t>(data));
@@ -883,6 +893,10 @@
         default:
           BLOATY_UNREACHABLE();
       }
+    case DW_FORM_ref_addr:
+      if (reader.unit_sizes().dwarf_version() <= 2) {
+        goto address_size;
+      }
     case DW_FORM_sec_offset:
       if (reader.unit_sizes().dwarf64()) {
         return AttrValue(ReadMemcpy<uint64_t>(data));
@@ -1108,13 +1122,13 @@
 
   sizes_.SetAddressSize(address_size);
   data = sizes_.ReadInitialLength(&data);
-  uint16_t version = ReadMemcpy<uint16_t>(&data);
+  sizes_.ReadDWARFVersion(&data);
   uint64_t header_length = sizes_.ReadDWARFOffset(&data);
   string_view program = data;
   SkipBytes(header_length, &program);
 
   params_.minimum_instruction_length = ReadMemcpy<uint8_t>(&data);
-  if (version == 4) {
+  if (sizes_.dwarf_version() == 4) {
     params_.maximum_operations_per_instruction = ReadMemcpy<uint8_t>(&data);
 
     if (params_.maximum_operations_per_instruction == 0) {
@@ -1630,7 +1644,7 @@
     string_view unit = sizes.ReadInitialLength(&remaining);
     full_unit =
         full_unit.substr(0, unit.size() + (unit.data() - full_unit.data()));
-    dwarf::SkipBytes(2, &unit);
+    sizes.ReadDWARFVersion(&unit);
     uint64_t debug_info_offset = sizes.ReadDWARFOffset(&unit);
     bool ok = die_reader.SeekToCompilationUnit(
         dwarf::DIEReader::Section::kDebugInfo, debug_info_offset);
diff --git a/src/macho.cc b/src/macho.cc
index e318c90..3ba5239 100644
--- a/src/macho.cc
+++ b/src/macho.cc
@@ -427,7 +427,8 @@
 }
 
 template <class NList>
-void ParseSymbolsFromSymbolTable(const LoadCommand& cmd, RangeSink* sink) {
+void ParseSymbolsFromSymbolTable(const LoadCommand& cmd, SymbolTable* table,
+                                 RangeSink* sink) {
   auto symtab_cmd = GetStructPointer<symtab_command>(cmd.command_data);
 
   string_view symtab = StrictSubstr(cmd.file_data, symtab_cmd->symoff,
@@ -438,27 +439,41 @@
   uint32_t nsyms = symtab_cmd->nsyms;
   for (uint32_t i = 0; i < nsyms; i++) {
     auto sym = GetStructPointerAndAdvance<NList>(&symtab);
+    string_view sym_range(reinterpret_cast<const char*>(sym), sizeof(NList));
 
     if (sym->n_type & N_STAB || sym->n_value == 0) {
       continue;
     }
 
     string_view name = ReadNullTerminated(strtab.substr(sym->n_un.n_strx));
-    sink->AddVMRange("macho_symbols", sym->n_value, RangeSink::kUnknownSize,
-                     ItaniumDemangle(name, sink->data_source()));
+
+    if (sink->data_source() >= DataSource::kSymbols) {
+      sink->AddVMRange("macho_symbols", sym->n_value, RangeSink::kUnknownSize,
+                       ItaniumDemangle(name, sink->data_source()));
+    }
+
+    if (table) {
+      table->insert(std::make_pair(
+          name, std::make_pair(sym->n_value, RangeSink::kUnknownSize)));
+    }
+
+    // Capture the trailing NUL terminator.
+    name = string_view(name.data(), name.size() + 1);
+    sink->AddFileRangeFor("macho_symtab_name", sym->n_value, name);
+    sink->AddFileRangeFor("macho_symtab_sym", sym->n_value, sym_range);
   }
 }
 
-void ParseSymbols(string_view file_data, RangeSink* sink) {
+void ParseSymbols(string_view file_data, SymbolTable* symtab, RangeSink* sink) {
   ForEachLoadCommand(
       file_data, sink,
-      [sink](const LoadCommand& cmd) {
+      [symtab, sink](const LoadCommand& cmd) {
         switch (cmd.cmd) {
           case LC_SYMTAB:
             if (cmd.is64bit) {
-              ParseSymbolsFromSymbolTable<nlist_64>(cmd, sink);
+              ParseSymbolsFromSymbolTable<nlist_64>(cmd, symtab, sink);
             } else {
-              ParseSymbolsFromSymbolTable<struct nlist>(cmd, sink);
+              ParseSymbolsFromSymbolTable<struct nlist>(cmd, symtab, sink);
             }
             break;
           case LC_DYSYMTAB:
@@ -486,6 +501,78 @@
   sink->AddFileRange("macho_fallback", "[Unmapped]", sink->input_file().data());
 }
 
+template <class Segment, class Section>
+void ReadDebugSectionsFromSegment(LoadCommand cmd, dwarf::File* dwarf) {
+  auto segment = GetStructPointerAndAdvance<Segment>(&cmd.command_data);
+
+  if (segment->maxprot == VM_PROT_NONE) {
+    return;
+  }
+
+  string_view segname = ArrayToStr(segment->segname, 16);
+
+  if (segname != "__DWARF") {
+    return;
+  }
+
+  uint32_t nsects = segment->nsects;
+  for (uint32_t j = 0; j < nsects; j++) {
+    auto section = GetStructPointerAndAdvance<Section>(&cmd.command_data);
+    string_view sectname = ArrayToStr(section->sectname, 16);
+
+    // Zerofill sections occupy no file space; otherwise filesize == vmsize.
+    uint64_t filesize = section->size;
+    switch (section->flags & SECTION_TYPE) {
+      case S_ZEROFILL:
+      case S_GB_ZEROFILL:
+      case S_THREAD_LOCAL_ZEROFILL:
+        filesize = 0;
+        break;
+      default:
+        break;
+    }
+
+    string_view contents =
+        StrictSubstr(cmd.file_data, section->offset, filesize);
+
+    if (sectname == "__debug_aranges") {
+      dwarf->debug_aranges = contents;
+    } else if (sectname == "__debug_str") {
+      dwarf->debug_str = contents;
+    } else if (sectname == "__debug_info") {
+      dwarf->debug_info = contents;
+    } else if (sectname == "__debug_types") {
+      dwarf->debug_types = contents;
+    } else if (sectname == "__debug_abbrev") {
+      dwarf->debug_abbrev = contents;
+    } else if (sectname == "__debug_line") {
+      dwarf->debug_line = contents;
+    } else if (sectname == "__debug_loc") {
+      dwarf->debug_loc = contents;
+    } else if (sectname == "__debug_pubnames") {
+      dwarf->debug_pubnames = contents;
+    } else if (sectname == "__debug_pubtypes") {
+      dwarf->debug_pubtypes = contents;
+    } else if (sectname == "__debug_ranges") {
+      dwarf->debug_ranges = contents;
+    }
+  }
+}
+
+static void ReadDebugSectionsFromMachO(const InputFile& file, dwarf::File* dwarf) {
+  ForEachLoadCommand(file.data(), nullptr, [dwarf](const LoadCommand& cmd) {
+    switch (cmd.cmd) {
+      case LC_SEGMENT_64:
+        ReadDebugSectionsFromSegment<segment_command_64, section_64>(cmd,
+                                                                     dwarf);
+        break;
+      case LC_SEGMENT:
+        ReadDebugSectionsFromSegment<segment_command, section>(cmd, dwarf);
+        break;
+    }
+  });
+}
+
 class MachOObjectFile : public ObjectFile {
  public:
   MachOObjectFile(std::unique_ptr<InputFile> file_data)
@@ -520,10 +607,25 @@
         case DataSource::kRawSymbols:
         case DataSource::kShortSymbols:
         case DataSource::kFullSymbols:
-          ParseSymbols(debug_file().file_data().data(), sink);
+          ParseSymbols(debug_file().file_data().data(), nullptr, sink);
           break;
+        case DataSource::kCompileUnits: {
+          SymbolTable symtab;
+          DualMap symbol_map;
+          NameMunger empty_munger;
+          RangeSink symbol_sink(&debug_file().file_data(),
+                                sink->options(),
+                                DataSource::kRawSymbols,
+                                &sinks[0]->MapAtIndex(0));
+          symbol_sink.AddOutput(&symbol_map, &empty_munger);
+          ParseSymbols(debug_file().file_data().data(), &symtab, &symbol_sink);
+          dwarf::File dwarf;
+          ReadDebugSectionsFromMachO(debug_file().file_data(), &dwarf);
+          ReadDWARFCompileUnits(dwarf, symtab, symbol_map, sink);
+          ParseSymbols(sink->input_file().data(), nullptr, sink);
+          break;
+        }
         case DataSource::kArchiveMembers:
-        case DataSource::kCompileUnits:
         case DataSource::kInlines:
         default:
           THROW("Mach-O doesn't support this data source");
diff --git a/src/range_map.cc b/src/range_map.cc
index 5c3658e..39184b2 100644
--- a/src/range_map.cc
+++ b/src/range_map.cc
@@ -18,6 +18,8 @@
 
 namespace bloaty {
 
+constexpr uint64_t RangeMap::kUnknownSize;
+
 template <class T>
 uint64_t RangeMap::TranslateWithEntry(T iter, uint64_t addr) const {
   assert(EntryContains(iter, addr));
diff --git a/src/range_map.h b/src/range_map.h
index d16aea0..d2c62b2 100644
--- a/src/range_map.h
+++ b/src/range_map.h
@@ -136,7 +136,7 @@
     }
   }
 
-  static const uint64_t kUnknownSize = UINT64_MAX;
+  static constexpr uint64_t kUnknownSize = UINT64_MAX;
 
  private:
   friend class RangeMapTest;