// Copyright 2018 The Fuchsia Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "garnet/bin/zxdb/symbols/module_symbol_index.h"

#include <limits>

#include "garnet/bin/zxdb/common/file_util.h"
#include "garnet/bin/zxdb/common/string_util.h"
#include "garnet/bin/zxdb/symbols/dwarf_die_decoder.h"
#include "garnet/bin/zxdb/symbols/module_symbol_index_node.h"
#include "garnet/public/lib/fxl/logging.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"

namespace zxdb {

namespace {

// We want to index the things that may need to be referenced globally: global
// variables, file and class static variables, and function implementations.
//
// Indexable functions are the DW_TAG_subprogram entries that have a range of
// code. These implementations won't always have the full type information,
// when the declaration is separate from the implementation, the implementation
// will reference the separate declaration node. The declaration of the
// function will contain the name and have the proper nesting inside classes
// and namespaces, etc. according to the structure of the original code.
//
// Variables work similarly. A global variable will often have a separate
// declaration (in the proper namespaces) and storage (often outside of
// namespaces), but file-level statics with the declaration and storage
// declared all-in-one will have one entry representing everything.
//
// In a compile unit (basically one object file), there will likely be lots of
// declarations from all the headers, and a smaller number of actual function
// definitions and variable storage.
//
// From a high level, we want to search the DIEs for the implementations and
// variable storage which is the stuff that will need to be referenced from
// the global context in the debugger.
//
// Then we follow the link to their definition (if separate from the
// implementation), then walk up the tree to get the full class and namespacing
// information. But walking the tree upwards requires lots of linear searching
// since the tree is stored in a flat array.
//
// To index efficiently, do two passes:
//  1. Walk linearly through all DIEs:
//     1a. Find the ones we're interested in and save the information.
//     1b. For each one, save the index of the parent so we can efficiently
//         walk up the tree in pass 2.
//  2. Resolve the full type information for each function:
//     2a. Find the declaration for each function implementation DIE.
//     2b. Walk that declaration up to get the full context.
//     2c. Index that.
//
// Performance note: Having the unit extract its DIEs via DWARFUnit::dies() and
// DWARFUnit::getNumDIEs() basically iterates through the whole table, which
// we then do again here. We can probably speed things up by eliminating this
// call, calling unit.getUnitDIE(), and manually iterating the children of
// that.

// The SymbolStorage stores the information from the "implementation" of a
// symbol (a function DIE that has code or a variable that has a location),
// representing something we want to index. The entry will always refer to the
// DIE for the implementation, and the offset will refer to the offset of the
// DIE for the definition.
//
// Some functions and variables have separate definitions, and some don't. If
// the definition and implementation is the same, the offset will just point to
// the entry.
struct SymbolStorage {
  SymbolStorage(const llvm::DWARFDebugInfoEntry* e, uint64_t offs)
      : entry(e), definition_unit_offset(offs) {}

  const llvm::DWARFDebugInfoEntry* entry;
  uint64_t definition_unit_offset;
};

// Index used to indicate there is no parent.
constexpr unsigned kNoParent = std::numeric_limits<unsigned>::max();

// Returns true if the given abbreviation defines a PC range.
bool AbbrevHasCode(const llvm::DWARFAbbreviationDeclaration* abbrev) {
  for (const auto spec : abbrev->attributes()) {
    if (spec.Attr == llvm::dwarf::DW_AT_low_pc ||
        spec.Attr == llvm::dwarf::DW_AT_high_pc)
      return true;
  }
  return false;
}

// Returns true if the given abbreviation defines a "location".
bool AbbrevHasLocation(const llvm::DWARFAbbreviationDeclaration* abbrev) {
  for (const auto spec : abbrev->attributes()) {
    if (spec.Attr == llvm::dwarf::DW_AT_location)
      return true;
  }
  return false;
}

size_t RecursiveCountDies(const ModuleSymbolIndexNode& node) {
  size_t result = node.dies().size();
  for (const auto& pair : node.sub())
    result += RecursiveCountDies(pair.second);
  return result;
}

// Step 1 of the algorithm above. Fills the symbol_storage array with the
// information for all function implementations (ones with addresses). Fills
// the parent_indices array with the index of the parent of each DIE in the
// unit (it will be exactly unit->getNumDIEs() long). The root node will have
// kNoParent set.
void ExtractUnitIndexableEntries(llvm::DWARFContext* context,
                                 llvm::DWARFUnit* unit,
                                 std::vector<SymbolStorage>* symbol_storage,
                                 std::vector<unsigned>* parent_indices) {
  DwarfDieDecoder decoder(context, unit);

  // The offset of the declaration. This can be unit-relative or file-absolute.
  // This code doesn't implement the file-absolute variant which it seems our
  // toolchain doesn't generate. To implement I'm thinking everything
  // with an absolute offset will be put into a global list and processed in a
  // third pass once all units are processed. This third pass will be slower
  // since probably we won't do any optimized lookups.
  llvm::Optional<uint64_t> decl_unit_offset;
  llvm::Optional<uint64_t> decl_global_offset;
  decoder.AddReference(llvm::dwarf::DW_AT_specification, &decl_unit_offset,
                       &decl_global_offset);

  // Stores the index of the parent DIE for each one we encounter. The root
  // DIE with no parent will be set to kNoParent.
  unsigned die_count = unit->getNumDIEs();
  parent_indices->resize(die_count);

  // Stores the list of parent indices according to the current depth in the
  // tree. At any given point, the parent index of the current node will be
  // tree_stack.back(). inside_function should be set if this node or any
  // parent node is a function.
  struct StackEntry {
    StackEntry(int d, unsigned i, bool f)
        : depth(d), index(i), inside_function(f) {}

    int depth;
    unsigned index;
    bool inside_function;
  };
  std::vector<StackEntry> tree_stack;
  tree_stack.reserve(8);
  tree_stack.push_back(StackEntry(-1, kNoParent, false));

  for (unsigned i = 0; i < die_count; i++) {
    // All optional variables need to be reset so we know which ones are set
    // by the current DIE.
    decl_unit_offset.reset();
    decl_global_offset.reset();

    const llvm::DWARFDebugInfoEntry* die =
        unit->getDIEAtIndex(i).getDebugInfoEntry();
    const llvm::DWARFAbbreviationDeclaration* abbrev =
        die->getAbbreviationDeclarationPtr();
    if (!abbrev)
      continue;

    // See if we should bother decoding. Decode is the slowest part of the
    // indexing so try to avoid it. Here we check the tag and whether the
    // abbreviation entry has the required attributes before doing decode since
    // this will eliminate the majority of DIEs in typical programs.
    //
    // Note: Trying to cache whether the abbreviation declaration is of the
    // right type (there are a limited number of types of these) doesn't help.
    // Checking the abbreviation array is ~6-12 comparisons, which is roughly
    // equivalent to [unordered_]map lookup.
    bool is_function = false;
    bool should_index = false;
    if (abbrev->getTag() == llvm::dwarf::DW_TAG_subprogram &&
        AbbrevHasCode(abbrev)) {
      // Found a function implementation.
      is_function = true;
      should_index = true;
    } else if (!tree_stack.back().inside_function &&
               abbrev->getTag() == llvm::dwarf::DW_TAG_variable &&
               AbbrevHasLocation(abbrev)) {
      // Found variable storage outside of a function (variables inside
      // functions are local so don't get added to the global index).
      should_index = true;
    }

    // Add this node to the index.
    if (should_index) {
      decoder.Decode(*die);
      if (decl_unit_offset) {
        // Save the declaration for indexing.
        symbol_storage->emplace_back(die,
                                     unit->getOffset() + *decl_unit_offset);
      } else if (decl_global_offset) {
        FXL_NOTREACHED() << "Implement DW_FORM_ref_addr for references.";
      } else {
        // This function has no separate definition so use it as its own
        // declaration (the name and such will be on itself).
        symbol_storage->emplace_back(die, die->getOffset());
      }
    }

    // Fix up the parent tracking stack.
    StackEntry& tree_stack_back = tree_stack.back();
    int current_depth = static_cast<int>(die->getDepth());
    if (current_depth == tree_stack_back.depth) {
      // Common case: depth not changing. Just update the topmost item in the
      // stack to point to the current node.
      tree_stack_back.index = i;
    } else {
      // Tree changed. First check for moving up in the tree and pop the stack
      // until we're at the parent of the current level (for going deeper in
      // the tree this will do nothing), then add the current level.
      while (tree_stack.back().depth >= current_depth)
        tree_stack.pop_back();

      tree_stack.push_back(StackEntry(
          current_depth, i, is_function || tree_stack.back().inside_function));
    }

    // Save parent info. The parent of this node is the one right before the
    // current one (the last one in the stack).
    (*parent_indices)[i] = (tree_stack.end() - 2)->index;
  }
}

// The per-function part of step 2 of the algorithm described above. This
// finds the definition of the function in the unit's DIEs. It's given a
// map of DIE indices to their parent indices generated for the unit by
// ExtractUnitIndexableEntries for quickly finding parents.
class SymbolStorageIndexer {
 public:
  SymbolStorageIndexer(llvm::DWARFContext* context, llvm::DWARFUnit* unit,
                       const std::vector<unsigned>& parent_indices,
                       ModuleSymbolIndexNode* root)
      : unit_(unit),
        parent_indices_(parent_indices),
        root_(root),
        decoder_(context, unit) {
    decoder_.AddCString(llvm::dwarf::DW_AT_name, &name_);
  }

  void AddDIE(const SymbolStorage& impl) {
    // Components of the function name in reverse order (so "foo::Bar::Fn")
    // would be { "Fn", "Bar", "foo"}
    std::vector<std::string> components;

    // Find the declaration DIE for the function. Perf note: getDIEForOffset()
    // is a binary search.
    llvm::DWARFDie die = unit_->getDIEForOffset(impl.definition_unit_offset);
    if (!die.isValid())
      return;  // Invalid
    if (!FillName(die))
      return;
    components.emplace_back(*name_);

    unsigned index = unit_->getDIEIndex(die);
    while (true) {
      // Move up one level in the hierarchy.
      FXL_DCHECK(index <= parent_indices_.size());
      index = parent_indices_[index];
      if (index == kNoParent) {
        // Reached the root. In practice this shouldn't happen since following
        // the parent chain from a function should always lead to the compile
        // unit (handled below).
        break;
      }

      die = unit_->getDIEAtIndex(index);
      if (!die.isValid())
        return;  // Something is corrupted, don't add this function.

      if (die.getTag() == llvm::dwarf::DW_TAG_compile_unit)
        break;  // Reached the root.

      // Validate the type of this entry. We don't want to index things
      // like functions inside classes locally defined in functions since
      // there's no good way to refer to these by global name.
      if (die.getTag() != llvm::dwarf::DW_TAG_namespace &&
          die.getTag() != llvm::dwarf::DW_TAG_class_type &&
          die.getTag() != llvm::dwarf::DW_TAG_structure_type)
        return;

      if (!FillName(die))
        return;  // Likely corrupt, these nodes should have names.
      components.emplace_back(*name_);
    }

    // Add the function to the index.
    ModuleSymbolIndexNode* cur = root_;
    for (int i = static_cast<int>(components.size()) - 1; i >= 0; i--)
      cur = cur->AddChild(std::move(components[i]));
    cur->AddDie(ModuleSymbolIndexNode::DieRef(impl.entry->getOffset()));
  }

 private:
  // Fills the name_ member for the given DIE. Returns true if the DIE was
  // decoded properly and name_ was properly filled in.
  bool FillName(const llvm::DWARFDie& die) {
    name_.reset();
    if (!decoder_.Decode(*die.getDebugInfoEntry()) || !name_)
      return false;  // Node with no name, skip this function.
    return true;
  }

  llvm::DWARFUnit* unit_;
  const std::vector<unsigned>& parent_indices_;
  ModuleSymbolIndexNode* root_;

  DwarfDieDecoder decoder_;
  llvm::Optional<const char*> name_;  // Decoder writes into this.
};

}  // namespace

ModuleSymbolIndex::ModuleSymbolIndex() = default;
ModuleSymbolIndex::~ModuleSymbolIndex() = default;

void ModuleSymbolIndex::CreateIndex(llvm::object::ObjectFile* object_file) {
  std::unique_ptr<llvm::DWARFContext> context = llvm::DWARFContext::create(
      *object_file, nullptr, llvm::DWARFContext::defaultErrorHandler);

  llvm::DWARFUnitVector compile_units;
  context->getDWARFObj().forEachInfoSections(
      [&](const llvm::DWARFSection& s) {
        compile_units.addUnitsForSection(*context, s, llvm::DW_SECT_INFO);
      });

  for (unsigned i = 0; i < compile_units.size(); i++) {
    IndexCompileUnit(context.get(), compile_units[i].get(), i);

    // Free all compilation units as we process them. They will hold all of
    // the parsed DIE data that we don't need any more which can be multiple
    // GB's for large programs.
    compile_units[i].reset();
  }

  IndexFileNames();
}

size_t ModuleSymbolIndex::CountSymbolsIndexed() const {
  return RecursiveCountDies(root_);
}

const std::vector<ModuleSymbolIndexNode::DieRef>& ModuleSymbolIndex::FindExact(
    const std::string& input) const {
  // Split the input on "::" which we'll traverse the tree with.
  //
  // TODO(brettw) this doesn't handle a lot of things like templates. By
  // blindly splitting on "::" we'll never find functions like
  // "std::vector<Foo::Bar>::insert".
  std::string separator("::");

  const ModuleSymbolIndexNode* cur = &root_;

  size_t input_index = 0;
  while (input_index < input.size()) {
    size_t next = input.find(separator, input_index);

    std::string cur_name;
    if (next == std::string::npos) {
      cur_name = input.substr(input_index);
      input_index = input.size();
    } else {
      cur_name = input.substr(input_index, next - input_index);
      input_index = next + separator.size();  // Skip over "::".
    }

    auto found = cur->sub().find(cur_name);
    if (found == cur->sub().end()) {
      static std::vector<ModuleSymbolIndexNode::DieRef> empty_vector;
      return empty_vector;
    }

    cur = &found->second;
  }

  return cur->dies();
}

std::vector<std::string> ModuleSymbolIndex::FindFileMatches(
    const std::string& name) const {
  std::string_view name_last_comp = ExtractLastFileComponent(name);

  std::vector<std::string> result;

  // Search all files whose last component matches (the input may contain more
  // than one component).
  FileNameIndex::const_iterator iter =
      file_name_index_.lower_bound(name_last_comp);
  while (iter != file_name_index_.end() && iter->first == name_last_comp) {
    const auto& pair = *iter->second;
    if (StringEndsWith(pair.first, name) &&
        (pair.first.size() == name.size() ||
         pair.first[pair.first.size() - name.size() - 1] == '/')) {
      result.push_back(pair.first);
    }
    ++iter;
  }

  return result;
}

const std::vector<unsigned>* ModuleSymbolIndex::FindFileUnitIndices(
    const std::string& name) const {
  auto found = files_.find(name);
  if (found == files_.end())
    return nullptr;
  return &found->second;
}

void ModuleSymbolIndex::DumpFileIndex(std::ostream& out) const {
  for (const auto& [filename, file_index_entry]: file_name_index_) {
    const auto& [filepath, compilation_units] = *file_index_entry;
    out << filename << " -> " << filepath << " -> "
        << compilation_units.size() << " units\n";
  }
}

void ModuleSymbolIndex::IndexCompileUnit(llvm::DWARFContext* context,
                                         llvm::DWARFUnit* unit,
                                         unsigned unit_index) {
  // Find the things to index.
  std::vector<SymbolStorage> symbol_storage;
  symbol_storage.reserve(256);
  std::vector<unsigned> parent_indices;
  ExtractUnitIndexableEntries(context, unit, &symbol_storage, &parent_indices);

  // Index each one.
  SymbolStorageIndexer indexer(context, unit, parent_indices, &root_);
  for (const SymbolStorage& impl : symbol_storage)
    indexer.AddDIE(impl);

  IndexCompileUnitSourceFiles(context, unit, unit_index);
}

void ModuleSymbolIndex::IndexCompileUnitSourceFiles(llvm::DWARFContext* context,
                                                    llvm::DWARFUnit* unit,
                                                    unsigned unit_index) {
  const llvm::DWARFDebugLine::LineTable* line_table =
      context->getLineTableForUnit(unit);
  const char* compilation_dir = unit->getCompilationDir();

  // This table is the size of the file name table. Entries are set to 1 when
  // we've added them to the index already.
  std::vector<int> added_file;
  added_file.resize(line_table->Prologue.FileNames.size(), 0);

  // We don't want to just add all the files from the line table to the index.
  // The line table will contain entries for every file referenced by the
  // compilation unit, which includes declarations. We want only files that
  // contribute code, which in practice is a tiny fraction of the total.
  //
  // To get this, iterate through the unit's row table and collect all
  // referenced file names.
  std::string file_name;
  for (size_t i = 0; i < line_table->Rows.size(); i++) {
    auto file_id = line_table->Rows[i].File;  // 1-based!
    if (file_id < 1 || file_id > added_file.size())
      continue;
    auto file_index = file_id - 1;

    if (!added_file[file_index]) {
      added_file[file_index] = 1;
      if (line_table->getFileNameByIndex(
              file_id, compilation_dir,
              llvm::DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
              file_name)) {
        // The files here can contain relative components like
        // "/foo/bar/../baz". This is OK because we want it to match other
        // places in the symbol code that do a similar computation to get a
        // file name.
        files_[file_name].push_back(unit_index);
      }
    }
  }
}

void ModuleSymbolIndex::IndexFileNames() {
  for (FileIndex::const_iterator iter = files_.begin(); iter != files_.end();
       ++iter) {
    std::string_view name = ExtractLastFileComponent(iter->first);
    file_name_index_.insert(std::make_pair(name, iter));
  }
}

}  // namespace zxdb
