blob: 49f675bf9a5570ad8fc8afdcfc317d6a1c6a5314 [file] [log] [blame]
// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "garnet/bin/zxdb/symbols/module_symbol_index.h"
#include <limits>
#include "garnet/bin/zxdb/common/file_util.h"
#include "garnet/bin/zxdb/common/string_util.h"
#include "garnet/bin/zxdb/symbols/dwarf_die_decoder.h"
#include "garnet/bin/zxdb/symbols/module_symbol_index_node.h"
#include "garnet/public/lib/fxl/logging.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
namespace zxdb {
namespace {
// We want to index the things that may need to be referenced globally: global
// variables, file and class static variables, and function implementations.
//
// Indexable functions are the DW_TAG_subprogram entries that have a range of
// code. These implementations won't always have the full type information,
// when the declaration is separate from the implementation, the implementation
// will reference the separate declaration node. The declaration of the
// function will contain the name and have the proper nesting inside classes
// and namespaces, etc. according to the structure of the original code.
//
// Variables work similarly. A global variable will often have a separate
// declaration (in the proper namespaces) and storage (often outside of
// namespaces), but file-level statics with the declaration and storage
// declared all-in-one will have one entry representing everything.
//
// In a compile unit (basically one object file), there will likely be lots of
// declarations from all the headers, and a smaller number of actual function
// definitions and variable storage.
//
// From a high level, we want to search the DIEs for the implementations and
// variable storage which is the stuff that will need to be referenced from
// the global context in the debugger.
//
// Then we follow the link to their definition (if separate from the
// implementation), then walk up the tree to get the full class and namespacing
// information. But walking the tree upwards requires lots of linear searching
// since the tree is stored in a flat array.
//
// To index efficiently, do two passes:
// 1. Walk linearly through all DIEs:
// 1a. Find the ones we're interested in and save the information.
// 1b. For each one, save the index of the parent so we can efficiently
// walk up the tree in pass 2.
// 2. Resolve the full type information for each function:
// 2a. Find the declaration for each function implementation DIE.
// 2b. Walk that declaration up to get the full context.
// 2c. Index that.
//
// Performance note: Having the unit extract its DIEs via DWARFUnit::dies() and
// DWARFUnit::getNumDIEs() basically iterates through the whole table, which
// we then do again here. We can probably speed things up by eliminating this
// call, calling unit.getUnitDIE(), and manually iterating the children of
// that.
// The SymbolStorage stores the information from the "implementation" of a
// symbol (a function DIE that has code or a variable that has a location),
// representing something we want to index. The entry will always refer to the
// DIE for the implementation, and the offset will refer to the offset of the
// DIE for the definition.
//
// Some functions and variables have separate definitions, and some don't. If
// the definition and implementation is the same, the offset will just point to
// the entry.
struct SymbolStorage {
SymbolStorage(const llvm::DWARFDebugInfoEntry* e, uint64_t offs)
: entry(e), definition_unit_offset(offs) {}
const llvm::DWARFDebugInfoEntry* entry;
uint64_t definition_unit_offset;
};
// Index used to indicate there is no parent.
constexpr unsigned kNoParent = std::numeric_limits<unsigned>::max();
// Returns true if the given abbreviation defines a PC range.
bool AbbrevHasCode(const llvm::DWARFAbbreviationDeclaration* abbrev) {
for (const auto spec : abbrev->attributes()) {
if (spec.Attr == llvm::dwarf::DW_AT_low_pc ||
spec.Attr == llvm::dwarf::DW_AT_high_pc)
return true;
}
return false;
}
// Returns true if the given abbreviation defines a "location".
bool AbbrevHasLocation(const llvm::DWARFAbbreviationDeclaration* abbrev) {
for (const auto spec : abbrev->attributes()) {
if (spec.Attr == llvm::dwarf::DW_AT_location)
return true;
}
return false;
}
size_t RecursiveCountDies(const ModuleSymbolIndexNode& node) {
size_t result = node.dies().size();
for (const auto& pair : node.sub())
result += RecursiveCountDies(pair.second);
return result;
}
// Step 1 of the algorithm above. Fills the symbol_storage array with the
// information for all function implementations (ones with addresses). Fills
// the parent_indices array with the index of the parent of each DIE in the
// unit (it will be exactly unit->getNumDIEs() long). The root node will have
// kNoParent set.
void ExtractUnitIndexableEntries(llvm::DWARFContext* context,
llvm::DWARFUnit* unit,
std::vector<SymbolStorage>* symbol_storage,
std::vector<unsigned>* parent_indices) {
DwarfDieDecoder decoder(context, unit);
// The offset of the declaration. This can be unit-relative or file-absolute.
// This code doesn't implement the file-absolute variant which it seems our
// toolchain doesn't generate. To implement I'm thinking everything
// with an absolute offset will be put into a global list and processed in a
// third pass once all units are processed. This third pass will be slower
// since probably we won't do any optimized lookups.
llvm::Optional<uint64_t> decl_unit_offset;
llvm::Optional<uint64_t> decl_global_offset;
decoder.AddReference(llvm::dwarf::DW_AT_specification, &decl_unit_offset,
&decl_global_offset);
// Stores the index of the parent DIE for each one we encounter. The root
// DIE with no parent will be set to kNoParent.
unsigned die_count = unit->getNumDIEs();
parent_indices->resize(die_count);
// Stores the list of parent indices according to the current depth in the
// tree. At any given point, the parent index of the current node will be
// tree_stack.back(). inside_function should be set if this node or any
// parent node is a function.
struct StackEntry {
StackEntry(int d, unsigned i, bool f)
: depth(d), index(i), inside_function(f) {}
int depth;
unsigned index;
bool inside_function;
};
std::vector<StackEntry> tree_stack;
tree_stack.reserve(8);
tree_stack.push_back(StackEntry(-1, kNoParent, false));
for (unsigned i = 0; i < die_count; i++) {
// All optional variables need to be reset so we know which ones are set
// by the current DIE.
decl_unit_offset.reset();
decl_global_offset.reset();
const llvm::DWARFDebugInfoEntry* die =
unit->getDIEAtIndex(i).getDebugInfoEntry();
const llvm::DWARFAbbreviationDeclaration* abbrev =
die->getAbbreviationDeclarationPtr();
if (!abbrev)
continue;
// See if we should bother decoding. Decode is the slowest part of the
// indexing so try to avoid it. Here we check the tag and whether the
// abbreviation entry has the required attributes before doing decode since
// this will eliminate the majority of DIEs in typical programs.
//
// Note: Trying to cache whether the abbreviation declaration is of the
// right type (there are a limited number of types of these) doesn't help.
// Checking the abbreviation array is ~6-12 comparisons, which is roughly
// equivalent to [unordered_]map lookup.
bool is_function = false;
bool should_index = false;
if (abbrev->getTag() == llvm::dwarf::DW_TAG_subprogram &&
AbbrevHasCode(abbrev)) {
// Found a function implementation.
is_function = true;
should_index = true;
} else if (!tree_stack.back().inside_function &&
abbrev->getTag() == llvm::dwarf::DW_TAG_variable &&
AbbrevHasLocation(abbrev)) {
// Found variable storage outside of a function (variables inside
// functions are local so don't get added to the global index).
should_index = true;
}
// Add this node to the index.
if (should_index) {
decoder.Decode(*die);
if (decl_unit_offset) {
// Save the declaration for indexing.
symbol_storage->emplace_back(die,
unit->getOffset() + *decl_unit_offset);
} else if (decl_global_offset) {
FXL_NOTREACHED() << "Implement DW_FORM_ref_addr for references.";
} else {
// This function has no separate definition so use it as its own
// declaration (the name and such will be on itself).
symbol_storage->emplace_back(die, die->getOffset());
}
}
// Fix up the parent tracking stack.
StackEntry& tree_stack_back = tree_stack.back();
int current_depth = static_cast<int>(die->getDepth());
if (current_depth == tree_stack_back.depth) {
// Common case: depth not changing. Just update the topmost item in the
// stack to point to the current node.
tree_stack_back.index = i;
} else {
// Tree changed. First check for moving up in the tree and pop the stack
// until we're at the parent of the current level (for going deeper in
// the tree this will do nothing), then add the current level.
while (tree_stack.back().depth >= current_depth)
tree_stack.pop_back();
tree_stack.push_back(StackEntry(
current_depth, i, is_function || tree_stack.back().inside_function));
}
// Save parent info. The parent of this node is the one right before the
// current one (the last one in the stack).
(*parent_indices)[i] = (tree_stack.end() - 2)->index;
}
}
// The per-function part of step 2 of the algorithm described above. This
// finds the definition of the function in the unit's DIEs. It's given a
// map of DIE indices to their parent indices generated for the unit by
// ExtractUnitIndexableEntries for quickly finding parents.
class SymbolStorageIndexer {
public:
SymbolStorageIndexer(llvm::DWARFContext* context, llvm::DWARFUnit* unit,
const std::vector<unsigned>& parent_indices,
ModuleSymbolIndexNode* root)
: unit_(unit),
parent_indices_(parent_indices),
root_(root),
decoder_(context, unit) {
decoder_.AddCString(llvm::dwarf::DW_AT_name, &name_);
}
void AddDIE(const SymbolStorage& impl) {
// Components of the function name in reverse order (so "foo::Bar::Fn")
// would be { "Fn", "Bar", "foo"}
std::vector<std::string> components;
// Find the declaration DIE for the function. Perf note: getDIEForOffset()
// is a binary search.
llvm::DWARFDie die = unit_->getDIEForOffset(impl.definition_unit_offset);
if (!die.isValid())
return; // Invalid
if (!FillName(die))
return;
components.emplace_back(*name_);
unsigned index = unit_->getDIEIndex(die);
while (true) {
// Move up one level in the hierarchy.
FXL_DCHECK(index <= parent_indices_.size());
index = parent_indices_[index];
if (index == kNoParent) {
// Reached the root. In practice this shouldn't happen since following
// the parent chain from a function should always lead to the compile
// unit (handled below).
break;
}
die = unit_->getDIEAtIndex(index);
if (!die.isValid())
return; // Something is corrupted, don't add this function.
if (die.getTag() == llvm::dwarf::DW_TAG_compile_unit)
break; // Reached the root.
// Validate the type of this entry. We don't want to index things
// like functions inside classes locally defined in functions since
// there's no good way to refer to these by global name.
if (die.getTag() != llvm::dwarf::DW_TAG_namespace &&
die.getTag() != llvm::dwarf::DW_TAG_class_type &&
die.getTag() != llvm::dwarf::DW_TAG_structure_type)
return;
if (!FillName(die))
return; // Likely corrupt, these nodes should have names.
components.emplace_back(*name_);
}
// Add the function to the index.
ModuleSymbolIndexNode* cur = root_;
for (int i = static_cast<int>(components.size()) - 1; i >= 0; i--)
cur = cur->AddChild(std::move(components[i]));
cur->AddDie(ModuleSymbolIndexNode::DieRef(impl.entry->getOffset()));
}
private:
// Fills the name_ member for the given DIE. Returns true if the DIE was
// decoded properly and name_ was properly filled in.
bool FillName(const llvm::DWARFDie& die) {
name_.reset();
if (!decoder_.Decode(*die.getDebugInfoEntry()) || !name_)
return false; // Node with no name, skip this function.
return true;
}
llvm::DWARFUnit* unit_;
const std::vector<unsigned>& parent_indices_;
ModuleSymbolIndexNode* root_;
DwarfDieDecoder decoder_;
llvm::Optional<const char*> name_; // Decoder writes into this.
};
} // namespace
ModuleSymbolIndex::ModuleSymbolIndex() = default;
ModuleSymbolIndex::~ModuleSymbolIndex() = default;
void ModuleSymbolIndex::CreateIndex(llvm::object::ObjectFile* object_file) {
std::unique_ptr<llvm::DWARFContext> context = llvm::DWARFContext::create(
*object_file, nullptr, llvm::DWARFContext::defaultErrorHandler);
llvm::DWARFUnitVector compile_units;
context->getDWARFObj().forEachInfoSections(
[&](const llvm::DWARFSection& s) {
compile_units.addUnitsForSection(*context, s, llvm::DW_SECT_INFO);
});
for (unsigned i = 0; i < compile_units.size(); i++) {
IndexCompileUnit(context.get(), compile_units[i].get(), i);
// Free all compilation units as we process them. They will hold all of
// the parsed DIE data that we don't need any more which can be multiple
// GB's for large programs.
compile_units[i].reset();
}
IndexFileNames();
}
size_t ModuleSymbolIndex::CountSymbolsIndexed() const {
return RecursiveCountDies(root_);
}
const std::vector<ModuleSymbolIndexNode::DieRef>& ModuleSymbolIndex::FindExact(
const std::string& input) const {
// Split the input on "::" which we'll traverse the tree with.
//
// TODO(brettw) this doesn't handle a lot of things like templates. By
// blindly splitting on "::" we'll never find functions like
// "std::vector<Foo::Bar>::insert".
std::string separator("::");
const ModuleSymbolIndexNode* cur = &root_;
size_t input_index = 0;
while (input_index < input.size()) {
size_t next = input.find(separator, input_index);
std::string cur_name;
if (next == std::string::npos) {
cur_name = input.substr(input_index);
input_index = input.size();
} else {
cur_name = input.substr(input_index, next - input_index);
input_index = next + separator.size(); // Skip over "::".
}
auto found = cur->sub().find(cur_name);
if (found == cur->sub().end()) {
static std::vector<ModuleSymbolIndexNode::DieRef> empty_vector;
return empty_vector;
}
cur = &found->second;
}
return cur->dies();
}
std::vector<std::string> ModuleSymbolIndex::FindFileMatches(
const std::string& name) const {
std::string_view name_last_comp = ExtractLastFileComponent(name);
std::vector<std::string> result;
// Search all files whose last component matches (the input may contain more
// than one component).
FileNameIndex::const_iterator iter =
file_name_index_.lower_bound(name_last_comp);
while (iter != file_name_index_.end() && iter->first == name_last_comp) {
const auto& pair = *iter->second;
if (StringEndsWith(pair.first, name) &&
(pair.first.size() == name.size() ||
pair.first[pair.first.size() - name.size() - 1] == '/')) {
result.push_back(pair.first);
}
++iter;
}
return result;
}
const std::vector<unsigned>* ModuleSymbolIndex::FindFileUnitIndices(
const std::string& name) const {
auto found = files_.find(name);
if (found == files_.end())
return nullptr;
return &found->second;
}
void ModuleSymbolIndex::DumpFileIndex(std::ostream& out) const {
for (const auto& [filename, file_index_entry]: file_name_index_) {
const auto& [filepath, compilation_units] = *file_index_entry;
out << filename << " -> " << filepath << " -> "
<< compilation_units.size() << " units\n";
}
}
void ModuleSymbolIndex::IndexCompileUnit(llvm::DWARFContext* context,
llvm::DWARFUnit* unit,
unsigned unit_index) {
// Find the things to index.
std::vector<SymbolStorage> symbol_storage;
symbol_storage.reserve(256);
std::vector<unsigned> parent_indices;
ExtractUnitIndexableEntries(context, unit, &symbol_storage, &parent_indices);
// Index each one.
SymbolStorageIndexer indexer(context, unit, parent_indices, &root_);
for (const SymbolStorage& impl : symbol_storage)
indexer.AddDIE(impl);
IndexCompileUnitSourceFiles(context, unit, unit_index);
}
void ModuleSymbolIndex::IndexCompileUnitSourceFiles(llvm::DWARFContext* context,
llvm::DWARFUnit* unit,
unsigned unit_index) {
const llvm::DWARFDebugLine::LineTable* line_table =
context->getLineTableForUnit(unit);
const char* compilation_dir = unit->getCompilationDir();
// This table is the size of the file name table. Entries are set to 1 when
// we've added them to the index already.
std::vector<int> added_file;
added_file.resize(line_table->Prologue.FileNames.size(), 0);
// We don't want to just add all the files from the line table to the index.
// The line table will contain entries for every file referenced by the
// compilation unit, which includes declarations. We want only files that
// contribute code, which in practice is a tiny fraction of the total.
//
// To get this, iterate through the unit's row table and collect all
// referenced file names.
std::string file_name;
for (size_t i = 0; i < line_table->Rows.size(); i++) {
auto file_id = line_table->Rows[i].File; // 1-based!
if (file_id < 1 || file_id > added_file.size())
continue;
auto file_index = file_id - 1;
if (!added_file[file_index]) {
added_file[file_index] = 1;
if (line_table->getFileNameByIndex(
file_id, compilation_dir,
llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
file_name)) {
// The files here can contain relative components like
// "/foo/bar/../baz". This is OK because we want it to match other
// places in the symbol code that do a similar computation to get a
// file name.
files_[file_name].push_back(unit_index);
}
}
}
}
void ModuleSymbolIndex::IndexFileNames() {
for (FileIndex::const_iterator iter = files_.begin(); iter != files_.end();
++iter) {
std::string_view name = ExtractLastFileComponent(iter->first);
file_name_index_.insert(std::make_pair(name, iter));
}
}
} // namespace zxdb