llvm/examples/ThinLtoJIT/ThinLtoModuleIndex.cpp - third_party/github.com/llvm/llvm-project - Git at Google

 #include "ThinLtoModuleIndex.h"

 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IRReader/IRReader.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"

 #include <memory>
 #include <string>

 #define DEBUG_TYPE "thinltojit"

 namespace llvm {
 namespace orc {

 Error ThinLtoModuleIndex::add(StringRef InputPath) {
   auto Buffer = errorOrToExpected(MemoryBuffer::getFile(InputPath));
   if (!Buffer)
     return Buffer.takeError();

   Error ParseErr = readModuleSummaryIndex((*Buffer)->getMemBufferRef(),
                                           CombinedSummaryIndex, NextModuleId);
   if (ParseErr)
     return ParseErr;

 #ifndef NDEBUG
   auto Paths = getAllModulePaths();
   unsigned TotalPaths = Paths.size();
   std::sort(Paths.begin(), Paths.end());
   Paths.erase(std::unique(Paths.begin(), Paths.end()), Paths.end());
   assert(TotalPaths == Paths.size() && "Module paths must be unique");
 #endif

   ++NextModuleId;
   return Error::success();
 }

 std::vector<StringRef> ThinLtoModuleIndex::getAllModulePaths() const {
   auto ModuleTable = CombinedSummaryIndex.modulePaths();

   std::vector<StringRef> Paths;
   Paths.resize(ModuleTable.size());

   for (const auto &KV : ModuleTable) {
     assert(Paths[KV.second.first].empty() && "IDs are unique and continuous");
     Paths[KV.second.first] = KV.first();
   }

   return Paths;
 }

 GlobalValueSummary *
 ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const {
   ValueInfo VI = CombinedSummaryIndex.getValueInfo(Function);
   if (!VI || VI.getSummaryList().empty())
     return nullptr;

   // There can be more than one symbol with the same GUID, in the case of same-
   // named locals in different but same-named source files that were compiled in
   // their respective directories (so the source file name and resulting GUID is
   // the same). We avoid this by checking that module paths are unique upon
   // add().
   //
   // TODO: We can still get duplicates on symbols declared with
   // attribute((weak)), a GNU extension supported by gcc and clang.
   // We should support it by looking for a symbol in the current module
   // or in the same module as the caller.
   assert(VI.getSummaryList().size() == 1 && "Weak symbols not yet supported");

   return VI.getSummaryList().front().get()->getBaseObject();
 }

 Optional<StringRef>
 ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const {
   if (GlobalValueSummary *S = getSummary(GlobalValue::getGUID(Name)))
     return S->modulePath();
   return None; // We don't know the symbol.
 }

 void ThinLtoModuleIndex::scheduleModuleParsingPrelocked(StringRef Path) {
   // Once the module was scheduled, we can call takeModule().
   auto ScheduledIt = ScheduledModules.find(Path);
   if (ScheduledIt != ScheduledModules.end())
     return;

   auto Worker = [this](std::string Path) {
     if (auto TSM = doParseModule(Path)) {
       std::lock_guard<std::mutex> Lock(ParsedModulesLock);
       ParsedModules[Path] = std::move(*TSM);

       LLVM_DEBUG(dbgs() << "Finished parsing module: " << Path << "\n");
     } else {
       ES.reportError(TSM.takeError());
     }
   };

   LLVM_DEBUG(dbgs() << "Schedule module for parsing: " << Path << "\n");
   ScheduledModules[Path] = ParseModuleWorkers.async(Worker, Path.str());
 }

 ThreadSafeModule ThinLtoModuleIndex::takeModule(StringRef Path) {
   std::unique_lock<std::mutex> ParseLock(ParsedModulesLock);

   auto ParsedIt = ParsedModules.find(Path);
   if (ParsedIt == ParsedModules.end()) {
     ParseLock.unlock();

     // The module is not ready, wait for the future we stored.
     std::unique_lock<std::mutex> ScheduleLock(ScheduledModulesLock);
     auto ScheduledIt = ScheduledModules.find(Path);
     assert(ScheduledIt != ScheduledModules.end() &&
            "Don't call for unscheduled modules");
     std::shared_future<void> Future = ScheduledIt->getValue();
     ScheduleLock.unlock();
     Future.get();

     ParseLock.lock();
     ParsedIt = ParsedModules.find(Path);
     assert(ParsedIt != ParsedModules.end() && "Must be ready now");
   }

   // We only add each module once. If it's not here anymore, we can skip it.
   ThreadSafeModule TSM = std::move(ParsedIt->getValue());
   ParsedIt->getValue() = ThreadSafeModule();
   return TSM;
 }

 ThreadSafeModule ThinLtoModuleIndex::parseModuleFromFile(StringRef Path) {
   {
     std::lock_guard<std::mutex> ScheduleLock(ScheduledModulesLock);
     scheduleModuleParsingPrelocked(Path);
   }
   return takeModule(Path);
 }

 Expected<ThreadSafeModule> ThinLtoModuleIndex::doParseModule(StringRef Path) {
   // TODO: make a SMDiagnosticError class for this
   SMDiagnostic Err;
   auto Ctx = std::make_unique<LLVMContext>();
   auto M = parseIRFile(Path, Err, *Ctx);
   if (!M) {
     std::string ErrDescription;
     {
       raw_string_ostream S(ErrDescription);
       Err.print("ThinLtoJIT", S);
     }
     return createStringError(inconvertibleErrorCode(),
                              "Failed to load module from file '%s' (%s)",
                              Path.data(), ErrDescription.c_str());
   }

   return ThreadSafeModule(std::move(M), std::move(Ctx));
 }

 // We don't filter visited functions. Discovery will often be retriggered
 // from the middle of already visited functions and it aims to reach a little
 // further each time.
 void ThinLtoModuleIndex::discoverCalleeModulePaths(FunctionSummary *S,
                                                    unsigned LookaheadLevels) {
   // Populate initial worklist
   std::vector<FunctionSummary *> Worklist;
   addToWorklist(Worklist, S->calls());
   unsigned Distance = 0;

   while (++Distance < LookaheadLevels) {
     // Process current worklist and populate a new one.
     std::vector<FunctionSummary *> NextWorklist;
     for (FunctionSummary *F : Worklist) {
       updatePathRank(F->modulePath(), Distance);
       addToWorklist(NextWorklist, F->calls());
     }
     Worklist = std::move(NextWorklist);
   }

   // Process the last worklist without filling a new one
   for (FunctionSummary *F : Worklist) {
     updatePathRank(F->modulePath(), Distance);
   }

   // Reset counts for known paths (includes both, scheduled and parsed modules).
   std::lock_guard<std::mutex> Lock(ScheduledModulesLock);
   for (const auto &KV : ScheduledModules) {
     PathRank[KV.first()].Count = 0;
   }
 }

 void ThinLtoModuleIndex::addToWorklist(
     std::vector<FunctionSummary *> &List,
     ArrayRef<FunctionSummary::EdgeTy> Calls) {
   for (const auto &Edge : Calls) {
     const auto &SummaryList = Edge.first.getSummaryList();
     if (!SummaryList.empty()) {
       GlobalValueSummary *S = SummaryList.front().get()->getBaseObject();
       assert(isa<FunctionSummary>(S) && "Callees must be functions");
       List.push_back(cast<FunctionSummary>(S));
     }
   }
 }

 // PathRank is global and continuous.
 void ThinLtoModuleIndex::updatePathRank(StringRef Path, unsigned Distance) {
   auto &Entry = PathRank[Path];
   Entry.Count += 1;
   Entry.MinDist = std::min(Entry.MinDist, Distance);
   assert(Entry.MinDist > 0 && "We want it as a divisor");
 }

 // TODO: The size of a ThreadPool's task queue is not accessible. It would
 // be great to know in order to estimate how many modules we schedule. The
 // more we schedule, the less precise is the ranking. The less we schedule,
 // the higher the risk for downtime.
 std::vector<std::string> ThinLtoModuleIndex::selectNextPaths() {
   struct ScorePath {
     float Score;
     unsigned MinDist;
     StringRef Path;
   };

   std::vector<ScorePath> Candidates;
   Candidates.reserve(PathRank.size());
   for (const auto &KV : PathRank) {
     float Score = static_cast<float>(KV.second.Count) / KV.second.MinDist;
     if (Score > .0f) {
       Candidates.push_back({Score, KV.second.MinDist, KV.first()});
     }
   }

   // Sort candidates by descending score.
   std::sort(Candidates.begin(), Candidates.end(),
             [](const ScorePath &LHS, const ScorePath &RHS) {
               return LHS.Score > RHS.Score;
             });

   // Sort highest score candidates by ascending minimal distance.
   size_t Selected =
       std::min(std::max<size_t>(NumParseModuleThreads, Candidates.size() / 2),
                Candidates.size());
   std::sort(Candidates.begin(), Candidates.begin() + Selected,
             [](const ScorePath &LHS, const ScorePath &RHS) {
               return LHS.MinDist < RHS.MinDist;
             });

   std::vector<std::string> Paths;
   Paths.reserve(Selected);
   for (unsigned i = 0; i < Selected; i++) {
     Paths.push_back(Candidates[i].Path.str());
   }

   LLVM_DEBUG(dbgs() << "ModuleIndex: select " << Paths.size() << " out of "
                     << Candidates.size() << " discovered paths\n");

   return Paths;
 }

 unsigned ThinLtoModuleIndex::getNumDiscoveredModules() const {
   // TODO: It would probably be more efficient to track the number of
   // unscheduled modules.
   unsigned NonNullItems = 0;
   for (const auto &KV : PathRank)
     if (KV.second.Count > 0)
       ++NonNullItems;
   return NonNullItems;
 }

 } // namespace orc
 } // namespace llvm
	#include "ThinLtoModuleIndex.h"

	#include "llvm/Bitcode/BitcodeReader.h"
	#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IRReader/IRReader.h"
	#include "llvm/Support/SourceMgr.h"
	#include "llvm/Support/raw_ostream.h"

	#include <memory>
	#include <string>

	#define DEBUG_TYPE "thinltojit"

	namespace llvm {
	namespace orc {

	Error ThinLtoModuleIndex::add(StringRef InputPath) {
	auto Buffer = errorOrToExpected(MemoryBuffer::getFile(InputPath));
	if (!Buffer)
	return Buffer.takeError();

	Error ParseErr = readModuleSummaryIndex((*Buffer)->getMemBufferRef(),
	CombinedSummaryIndex, NextModuleId);
	if (ParseErr)
	return ParseErr;

	#ifndef NDEBUG
	auto Paths = getAllModulePaths();
	unsigned TotalPaths = Paths.size();
	std::sort(Paths.begin(), Paths.end());
	Paths.erase(std::unique(Paths.begin(), Paths.end()), Paths.end());
	assert(TotalPaths == Paths.size() && "Module paths must be unique");
	#endif

	++NextModuleId;
	return Error::success();
	}

	std::vector<StringRef> ThinLtoModuleIndex::getAllModulePaths() const {
	auto ModuleTable = CombinedSummaryIndex.modulePaths();

	std::vector<StringRef> Paths;
	Paths.resize(ModuleTable.size());

	for (const auto &KV : ModuleTable) {
	assert(Paths[KV.second.first].empty() && "IDs are unique and continuous");
	Paths[KV.second.first] = KV.first();
	}

	return Paths;
	}

	GlobalValueSummary *
	ThinLtoModuleIndex::getSummary(GlobalValue::GUID Function) const {
	ValueInfo VI = CombinedSummaryIndex.getValueInfo(Function);
	if (!VI \|\| VI.getSummaryList().empty())
	return nullptr;

	// There can be more than one symbol with the same GUID, in the case of same-
	// named locals in different but same-named source files that were compiled in
	// their respective directories (so the source file name and resulting GUID is
	// the same). We avoid this by checking that module paths are unique upon
	// add().
	//
	// TODO: We can still get duplicates on symbols declared with
	// attribute((weak)), a GNU extension supported by gcc and clang.
	// We should support it by looking for a symbol in the current module
	// or in the same module as the caller.
	assert(VI.getSummaryList().size() == 1 && "Weak symbols not yet supported");

	return VI.getSummaryList().front().get()->getBaseObject();
	}

	Optional<StringRef>
	ThinLtoModuleIndex::getModulePathForSymbol(StringRef Name) const {
	if (GlobalValueSummary *S = getSummary(GlobalValue::getGUID(Name)))
	return S->modulePath();
	return None; // We don't know the symbol.
	}

	void ThinLtoModuleIndex::scheduleModuleParsingPrelocked(StringRef Path) {
	// Once the module was scheduled, we can call takeModule().
	auto ScheduledIt = ScheduledModules.find(Path);
	if (ScheduledIt != ScheduledModules.end())
	return;

	auto Worker = [this](std::string Path) {
	if (auto TSM = doParseModule(Path)) {
	std::lock_guard<std::mutex> Lock(ParsedModulesLock);
	ParsedModules[Path] = std::move(*TSM);

	LLVM_DEBUG(dbgs() << "Finished parsing module: " << Path << "\n");
	} else {
	ES.reportError(TSM.takeError());
	}
	};

	LLVM_DEBUG(dbgs() << "Schedule module for parsing: " << Path << "\n");
	ScheduledModules[Path] = ParseModuleWorkers.async(Worker, Path.str());
	}

	ThreadSafeModule ThinLtoModuleIndex::takeModule(StringRef Path) {
	std::unique_lock<std::mutex> ParseLock(ParsedModulesLock);

	auto ParsedIt = ParsedModules.find(Path);
	if (ParsedIt == ParsedModules.end()) {
	ParseLock.unlock();

	// The module is not ready, wait for the future we stored.
	std::unique_lock<std::mutex> ScheduleLock(ScheduledModulesLock);
	auto ScheduledIt = ScheduledModules.find(Path);
	assert(ScheduledIt != ScheduledModules.end() &&
	"Don't call for unscheduled modules");
	std::shared_future<void> Future = ScheduledIt->getValue();
	ScheduleLock.unlock();
	Future.get();

	ParseLock.lock();
	ParsedIt = ParsedModules.find(Path);
	assert(ParsedIt != ParsedModules.end() && "Must be ready now");
	}

	// We only add each module once. If it's not here anymore, we can skip it.
	ThreadSafeModule TSM = std::move(ParsedIt->getValue());
	ParsedIt->getValue() = ThreadSafeModule();
	return TSM;
	}

	ThreadSafeModule ThinLtoModuleIndex::parseModuleFromFile(StringRef Path) {
	{
	std::lock_guard<std::mutex> ScheduleLock(ScheduledModulesLock);
	scheduleModuleParsingPrelocked(Path);
	}
	return takeModule(Path);
	}

	Expected<ThreadSafeModule> ThinLtoModuleIndex::doParseModule(StringRef Path) {
	// TODO: make a SMDiagnosticError class for this
	SMDiagnostic Err;
	auto Ctx = std::make_unique<LLVMContext>();
	auto M = parseIRFile(Path, Err, *Ctx);
	if (!M) {
	std::string ErrDescription;
	{
	raw_string_ostream S(ErrDescription);
	Err.print("ThinLtoJIT", S);
	}
	return createStringError(inconvertibleErrorCode(),
	"Failed to load module from file '%s' (%s)",
	Path.data(), ErrDescription.c_str());
	}

	return ThreadSafeModule(std::move(M), std::move(Ctx));
	}

	// We don't filter visited functions. Discovery will often be retriggered
	// from the middle of already visited functions and it aims to reach a little
	// further each time.
	void ThinLtoModuleIndex::discoverCalleeModulePaths(FunctionSummary *S,
	unsigned LookaheadLevels) {
	// Populate initial worklist
	std::vector<FunctionSummary *> Worklist;
	addToWorklist(Worklist, S->calls());
	unsigned Distance = 0;

	while (++Distance < LookaheadLevels) {
	// Process current worklist and populate a new one.
	std::vector<FunctionSummary *> NextWorklist;
	for (FunctionSummary *F : Worklist) {
	updatePathRank(F->modulePath(), Distance);
	addToWorklist(NextWorklist, F->calls());
	}
	Worklist = std::move(NextWorklist);
	}

	// Process the last worklist without filling a new one
	for (FunctionSummary *F : Worklist) {
	updatePathRank(F->modulePath(), Distance);
	}

	// Reset counts for known paths (includes both, scheduled and parsed modules).
	std::lock_guard<std::mutex> Lock(ScheduledModulesLock);
	for (const auto &KV : ScheduledModules) {
	PathRank[KV.first()].Count = 0;
	}
	}

	void ThinLtoModuleIndex::addToWorklist(
	std::vector<FunctionSummary *> &List,
	ArrayRef<FunctionSummary::EdgeTy> Calls) {
	for (const auto &Edge : Calls) {
	const auto &SummaryList = Edge.first.getSummaryList();
	if (!SummaryList.empty()) {
	GlobalValueSummary *S = SummaryList.front().get()->getBaseObject();
	assert(isa<FunctionSummary>(S) && "Callees must be functions");
	List.push_back(cast<FunctionSummary>(S));
	}
	}
	}

	// PathRank is global and continuous.
	void ThinLtoModuleIndex::updatePathRank(StringRef Path, unsigned Distance) {
	auto &Entry = PathRank[Path];
	Entry.Count += 1;
	Entry.MinDist = std::min(Entry.MinDist, Distance);
	assert(Entry.MinDist > 0 && "We want it as a divisor");
	}

	// TODO: The size of a ThreadPool's task queue is not accessible. It would
	// be great to know in order to estimate how many modules we schedule. The
	// more we schedule, the less precise is the ranking. The less we schedule,
	// the higher the risk for downtime.
	std::vector<std::string> ThinLtoModuleIndex::selectNextPaths() {
	struct ScorePath {
	float Score;
	unsigned MinDist;
	StringRef Path;
	};

	std::vector<ScorePath> Candidates;
	Candidates.reserve(PathRank.size());
	for (const auto &KV : PathRank) {
	float Score = static_cast<float>(KV.second.Count) / KV.second.MinDist;
	if (Score > .0f) {
	Candidates.push_back({Score, KV.second.MinDist, KV.first()});
	}
	}

	// Sort candidates by descending score.
	std::sort(Candidates.begin(), Candidates.end(),
	[](const ScorePath &LHS, const ScorePath &RHS) {
	return LHS.Score > RHS.Score;
	});

	// Sort highest score candidates by ascending minimal distance.
	size_t Selected =
	std::min(std::max<size_t>(NumParseModuleThreads, Candidates.size() / 2),
	Candidates.size());
	std::sort(Candidates.begin(), Candidates.begin() + Selected,
	[](const ScorePath &LHS, const ScorePath &RHS) {
	return LHS.MinDist < RHS.MinDist;
	});

	std::vector<std::string> Paths;
	Paths.reserve(Selected);
	for (unsigned i = 0; i < Selected; i++) {
	Paths.push_back(Candidates[i].Path.str());
	}

	LLVM_DEBUG(dbgs() << "ModuleIndex: select " << Paths.size() << " out of "
	<< Candidates.size() << " discovered paths\n");

	return Paths;
	}

	unsigned ThinLtoModuleIndex::getNumDiscoveredModules() const {
	// TODO: It would probably be more efficient to track the number of
	// unscheduled modules.
	unsigned NonNullItems = 0;
	for (const auto &KV : PathRank)
	if (KV.second.Count > 0)
	++NonNullItems;
	return NonNullItems;
	}

	} // namespace orc
	} // namespace llvm