[𝘀𝗽𝗿] changes to main this commit is based on

Created using spr 1.3.4

[skip ci]
diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index a529985..e82b857 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -386,8 +386,8 @@
   /// Profile match ratio.
   float ProfileMatchRatio{0.0f};
 
-  /// Raw branch count for this function in the profile.
-  uint64_t RawBranchCount{0};
+  /// Raw sample/branch count for this function in the profile.
+  uint64_t RawSampleCount{0};
 
   /// Dynamically executed function bytes, used for density computation.
   uint64_t SampleCountInBytes{0};
@@ -1880,13 +1880,12 @@
   /// Return COUNT_NO_PROFILE if there's no profile info.
   uint64_t getExecutionCount() const { return ExecutionCount; }
 
-  /// Return the raw profile information about the number of branch
-  /// executions corresponding to this function.
-  uint64_t getRawBranchCount() const { return RawBranchCount; }
+  /// Return the raw profile information about the number of samples (basic
+  /// profile) or branch executions (branch profile) recorded in this function.
+  uint64_t getRawSampleCount() const { return RawSampleCount; }
 
-  /// Set the profile data about the number of branch executions corresponding
-  /// to this function.
-  void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }
+  /// Set raw count of samples or branches recorded in this function.
+  void setRawSampleCount(uint64_t Count) { RawSampleCount = Count; }
 
   /// Return the number of dynamically executed bytes, from raw perf data.
   uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index c4ee75e..d66d198 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -92,16 +92,6 @@
     uint64_t Addr;
   };
 
-  /// Used for parsing specific pre-aggregated input files.
-  struct AggregatedLBREntry {
-    enum Type : char { BRANCH = 0, FT, FT_EXTERNAL_ORIGIN, TRACE };
-    Location From;
-    Location To;
-    uint64_t Count;
-    uint64_t Mispreds;
-    Type EntryType;
-  };
-
   struct Trace {
     uint64_t From;
     uint64_t To;
@@ -131,7 +121,6 @@
   /// and use them later for processing and assigning profile.
   std::unordered_map<Trace, TakenBranchInfo, TraceHash> BranchLBRs;
   std::unordered_map<Trace, FTInfo, TraceHash> FallthroughLBRs;
-  std::vector<AggregatedLBREntry> AggregatedLBRs;
   std::unordered_map<uint64_t, uint64_t> BasicSamples;
   std::vector<PerfMemSample> MemSamples;
 
@@ -416,14 +405,7 @@
   /// F 41be90 41be90 4
   /// B 4b1942 39b57f0 3 0
   /// B 4b196f 4b19e0 2 0
-  void parsePreAggregated();
-
-  /// Parse the full output of pre-aggregated LBR samples generated by
-  /// an external tool.
-  std::error_code parsePreAggregatedLBRSamples();
-
-  /// Process parsed pre-aggregated data.
-  void processPreAggregated();
+  std::error_code parsePreAggregated();
 
   /// If \p Address falls into the binary address space based on memory
   /// mapping info \p MMI, then adjust it for further processing by subtracting
diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
index 314dcc9..a7a0933 100644
--- a/bolt/include/bolt/Profile/DataReader.h
+++ b/bolt/include/bolt/Profile/DataReader.h
@@ -252,6 +252,9 @@
   /// Get the number of samples recorded in [Start, End)
   uint64_t getSamples(uint64_t Start, uint64_t End) const;
 
+  /// Returns the total number of samples recorded in this function.
+  uint64_t getSamples() const;
+
   /// Aggregation helper
   DenseMap<uint64_t, size_t> Index;
 
diff --git a/bolt/include/bolt/Profile/Heatmap.h b/bolt/include/bolt/Profile/Heatmap.h
index 74d7eed..a63b221 100644
--- a/bolt/include/bolt/Profile/Heatmap.h
+++ b/bolt/include/bolt/Profile/Heatmap.h
@@ -9,6 +9,7 @@
 #ifndef BOLT_PROFILE_HEATMAP_H
 #define BOLT_PROFILE_HEATMAP_H
 
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include <cstdint>
 #include <map>
@@ -57,9 +58,9 @@
   }
 
-  /// Register a single sample at \p Address.
+  /// Register \p Count samples at \p Address.
-  void registerAddress(uint64_t Address) {
+  void registerAddress(uint64_t Address, uint64_t Count) {
     if (!ignoreAddress(Address))
-      ++Map[Address / BucketSize];
+      Map[Address / BucketSize] += Count;
   }
 
   /// Register \p Count samples at [\p StartAddress, \p EndAddress ].
@@ -77,9 +78,22 @@
 
   void printCDF(raw_ostream &OS) const;
 
-  void printSectionHotness(StringRef Filename) const;
+  /// Struct describing individual section hotness.
+  struct SectionStats {
+    uint64_t Samples{0};
+    uint64_t Buckets{0};
+  };
 
-  void printSectionHotness(raw_ostream &OS) const;
+  /// Mapping from section name to associated \c SectionStats. Special entries:
+  /// - [total] for total stats,
+  /// - [unmapped] for samples outside any section, if non-zero.
+  using SectionStatsMap = StringMap<SectionStats>;
+
+  SectionStatsMap computeSectionStats() const;
+
+  void printSectionHotness(const SectionStatsMap &, StringRef Filename) const;
+
+  void printSectionHotness(const SectionStatsMap &, raw_ostream &OS) const;
 
   size_t size() const { return Map.size(); }
 };
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 9773e21..fc521dc 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -473,7 +473,7 @@
     OS << "\n  Image       : 0x" << Twine::utohexstr(getImageAddress());
   if (ExecutionCount != COUNT_NO_PROFILE) {
     OS << "\n  Exec Count  : " << ExecutionCount;
-    OS << "\n  Branch Count: " << RawBranchCount;
+    OS << "\n  Branch Count: " << RawSampleCount;
     OS << "\n  Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
   }
 
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d8628c6..420ffc8 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1445,7 +1445,7 @@
     if (!Function.hasProfile())
       continue;
 
-    uint64_t SampleCount = Function.getRawBranchCount();
+    uint64_t SampleCount = Function.getRawSampleCount();
     TotalSampleCount += SampleCount;
 
     if (Function.hasValidProfile()) {
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 80f4ea0..11850fa 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -349,25 +349,29 @@
   return false;
 }
 
-void DataAggregator::parsePreAggregated() {
-  std::string Error;
+std::error_code DataAggregator::parsePreAggregated() {
+  outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
+  NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
+                     TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> MB =
       MemoryBuffer::getFileOrSTDIN(Filename);
-  if (std::error_code EC = MB.getError()) {
-    errs() << "PERF2BOLT-ERROR: cannot open " << Filename << ": "
-           << EC.message() << "\n";
-    exit(1);
-  }
+  if (std::error_code EC = MB.getError())
+    return EC;
 
   FileBuf = std::move(*MB);
   ParsingBuf = FileBuf->getBuffer();
   Col = 0;
   Line = 1;
-  if (parsePreAggregatedLBRSamples()) {
-    errs() << "PERF2BOLT: failed to parse samples\n";
-    exit(1);
+  size_t AggregatedLBRs = 0;
+  while (hasData()) {
+    if (std::error_code EC = parseAggregatedLBREntry())
+      return EC;
+    ++AggregatedLBRs;
   }
+
+  outs() << "PERF2BOLT: read " << AggregatedLBRs << " aggregated LBR entries\n";
+  return std::error_code();
 }
 
 void DataAggregator::filterBinaryMMapInfo() {
@@ -446,11 +450,6 @@
 Error DataAggregator::preprocessProfile(BinaryContext &BC) {
   this->BC = &BC;
 
-  if (opts::ReadPreAggregated) {
-    parsePreAggregated();
-    return Error::success();
-  }
-
   if (std::optional<StringRef> FileBuildID = BC.getFileBuildID()) {
     outs() << "BOLT-INFO: binary build-id is:     " << *FileBuildID << "\n";
     processFileBuildID(*FileBuildID);
@@ -471,6 +470,12 @@
       ErrorCallback(ReturnCode, ErrBuf);
   };
 
+  if (opts::ReadPreAggregated) {
+    if (std::error_code EC = parsePreAggregated())
+      return errorCodeToError(EC);
+    goto heatmap;
+  }
+
   if (BC.IsLinuxKernel) {
     // Current MMap parsing logic does not work with linux kernel.
     // MMap entries for linux kernel uses PERF_RECORD_MMAP
@@ -499,16 +504,7 @@
   filterBinaryMMapInfo();
   prepareToParse("events", MainEventsPPI, ErrorCallback);
 
-  if (opts::HeatmapMode) {
-    if (std::error_code EC = printLBRHeatMap()) {
-      errs() << "ERROR: failed to print heat map: " << EC.message() << '\n';
-      exit(1);
-    }
-    exit(0);
-  }
-
-  if ((!opts::BasicAggregation && parseBranchEvents()) ||
-      (opts::BasicAggregation && parseBasicEvents()))
+  if (opts::BasicAggregation ? parseBasicEvents() : parseBranchEvents())
     errs() << "PERF2BOLT: failed to parse samples\n";
 
   // Special handling for memory events
@@ -521,6 +517,13 @@
 
   deleteTempFiles();
 
+heatmap:
+  if (opts::HeatmapMode) {
+    if (std::error_code EC = printLBRHeatMap())
+      return errorCodeToError(EC);
+    exit(0);
+  }
+
   return Error::success();
 }
 
@@ -557,9 +560,7 @@
 }
 
 void DataAggregator::processProfile(BinaryContext &BC) {
-  if (opts::ReadPreAggregated)
-    processPreAggregated();
-  else if (opts::BasicAggregation)
+  if (opts::BasicAggregation)
     processBasicEvents();
   else
     processBranchEvents();
@@ -567,15 +568,14 @@
   processMemEvents();
 
   // Mark all functions with registered events as having a valid profile.
-  const auto Flags = opts::BasicAggregation ? BinaryFunction::PF_SAMPLE
-                                            : BinaryFunction::PF_LBR;
   for (auto &BFI : BC.getBinaryFunctions()) {
     BinaryFunction &BF = BFI.second;
-    FuncBranchData *FBD = getBranchData(BF);
-    if (FBD || getFuncSampleData(BF.getNames())) {
-      BF.markProfiled(Flags);
-      if (FBD)
-        BF.RawBranchCount = FBD->getNumExecutedBranches();
+    if (FuncBranchData *FBD = getBranchData(BF)) {
+      BF.markProfiled(BinaryFunction::PF_LBR);
+      BF.RawSampleCount = FBD->getNumExecutedBranches();
+    } else if (FuncSampleData *FSD = getFuncSampleData(BF.getNames())) {
+      BF.markProfiled(BinaryFunction::PF_SAMPLE);
+      BF.RawSampleCount = FSD->getSamples();
     }
   }
 
@@ -588,7 +588,6 @@
   // Release intermediate storage.
   clear(BranchLBRs);
   clear(FallthroughLBRs);
-  clear(AggregatedLBRs);
   clear(BasicSamples);
   clear(MemSamples);
 }
@@ -632,10 +631,18 @@
 
 bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address,
                               uint64_t Count) {
+  // To record executed bytes, use basic block size as is regardless of BAT.
+  uint64_t BlockSize = 0;
+  if (BinaryBasicBlock *BB = OrigFunc.getBasicBlockContainingOffset(
+          Address - OrigFunc.getAddress()))
+    BlockSize = BB->getOriginalSize();
+
   BinaryFunction *ParentFunc = getBATParentFunction(OrigFunc);
   BinaryFunction &Func = ParentFunc ? *ParentFunc : OrigFunc;
-  if (ParentFunc || (BAT && !BAT->isBATFunction(OrigFunc.getAddress())))
+  if (ParentFunc || (BAT && !BAT->isBATFunction(Func.getAddress())))
     NumColdSamples += Count;
+  // Attach executed bytes to parent function in case of cold fragment.
+  Func.SampleCountInBytes += Count * BlockSize;
 
   auto I = NamesToSamples.find(Func.getOneName());
   if (I == NamesToSamples.end()) {
@@ -1209,15 +1216,14 @@
   ErrorOr<StringRef> TypeOrErr = parseString(FieldSeparator);
   if (std::error_code EC = TypeOrErr.getError())
     return EC;
-  auto Type = AggregatedLBREntry::TRACE;
-  if (LLVM_LIKELY(TypeOrErr.get() == "T")) {
-  } else if (TypeOrErr.get() == "B") {
-    Type = AggregatedLBREntry::BRANCH;
-  } else if (TypeOrErr.get() == "F") {
-    Type = AggregatedLBREntry::FT;
-  } else if (TypeOrErr.get() == "f") {
-    Type = AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
-  } else {
+  enum TType { TRACE, BRANCH, FT, FT_EXTERNAL_ORIGIN, INVALID };
+  auto Type = StringSwitch<TType>(TypeOrErr.get())
+                  .Case("T", TRACE)
+                  .Case("B", BRANCH)
+                  .Case("F", FT)
+                  .Case("f", FT_EXTERNAL_ORIGIN)
+                  .Default(INVALID);
+  if (Type == INVALID) {
     reportError("expected T, B, F or f");
     return make_error_code(llvm::errc::io_error);
   }
@@ -1235,7 +1241,7 @@
     return EC;
 
   ErrorOr<Location> TraceFtEnd = std::error_code();
-  if (Type == AggregatedLBREntry::TRACE) {
+  if (Type == TRACE) {
     while (checkAndConsumeFS()) {
     }
     TraceFtEnd = parseLocationOrOffset();
@@ -1245,13 +1251,12 @@
 
   while (checkAndConsumeFS()) {
   }
-  ErrorOr<int64_t> Frequency =
-      parseNumberField(FieldSeparator, Type != AggregatedLBREntry::BRANCH);
+  ErrorOr<int64_t> Frequency = parseNumberField(FieldSeparator, Type != BRANCH);
   if (std::error_code EC = Frequency.getError())
     return EC;
 
   uint64_t Mispreds = 0;
-  if (Type == AggregatedLBREntry::BRANCH) {
+  if (Type == BRANCH) {
     while (checkAndConsumeFS()) {
     }
     ErrorOr<int64_t> MispredsOrErr = parseNumberField(FieldSeparator, true);
@@ -1273,13 +1278,28 @@
       BF->setHasProfileAvailable();
 
   uint64_t Count = static_cast<uint64_t>(Frequency.get());
-  AggregatedLBREntry Entry{From.get(), To.get(), Count, Mispreds, Type};
-  AggregatedLBRs.emplace_back(Entry);
-  if (Type == AggregatedLBREntry::TRACE) {
-    auto FtType = (FromFunc == ToFunc) ? AggregatedLBREntry::FT
-                                       : AggregatedLBREntry::FT_EXTERNAL_ORIGIN;
-    AggregatedLBREntry TraceFt{To.get(), TraceFtEnd.get(), Count, 0, FtType};
-    AggregatedLBRs.emplace_back(TraceFt);
+
+  Trace Trace(From->Offset, To->Offset);
+  // Taken trace
+  if (Type == TRACE || Type == BRANCH) {
+    TakenBranchInfo &Info = BranchLBRs[Trace];
+    Info.TakenCount += Count;
+    Info.MispredCount += Mispreds;
+
+    NumTotalSamples += Count;
+  }
+  // Construct fallthrough part of the trace
+  if (Type == TRACE) {
+    Trace.From = To->Offset;
+    Trace.To = TraceFtEnd->Offset;
+    Type = FromFunc == ToFunc ? FT : FT_EXTERNAL_ORIGIN;
+  }
+  // Add fallthrough trace
+  if (Type != BRANCH) {
+    FTInfo &Info = FallthroughLBRs[Trace];
+    (Type == FT ? Info.InternCount : Info.ExternCount) += Count;
+
+    NumTraces += Count;
   }
 
   return std::error_code();
@@ -1301,53 +1321,6 @@
   }
   Heatmap HM(opts::HeatmapBlock, opts::HeatmapMinAddress,
              opts::HeatmapMaxAddress, getTextSections(BC));
-  uint64_t NumTotalSamples = 0;
-
-  if (opts::BasicAggregation) {
-    while (hasData()) {
-      ErrorOr<PerfBasicSample> SampleRes = parseBasicSample();
-      if (std::error_code EC = SampleRes.getError()) {
-        if (EC == errc::no_such_process)
-          continue;
-        return EC;
-      }
-      PerfBasicSample &Sample = SampleRes.get();
-      HM.registerAddress(Sample.PC);
-      NumTotalSamples++;
-    }
-    outs() << "HEATMAP: read " << NumTotalSamples << " basic samples\n";
-  } else {
-    while (hasData()) {
-      ErrorOr<PerfBranchSample> SampleRes = parseBranchSample();
-      if (std::error_code EC = SampleRes.getError()) {
-        if (EC == errc::no_such_process)
-          continue;
-        return EC;
-      }
-
-      PerfBranchSample &Sample = SampleRes.get();
-
-      // LBRs are stored in reverse execution order. NextLBR refers to the next
-      // executed branch record.
-      const LBREntry *NextLBR = nullptr;
-      for (const LBREntry &LBR : Sample.LBR) {
-        if (NextLBR) {
-          // Record fall-through trace.
-          const uint64_t TraceFrom = LBR.To;
-          const uint64_t TraceTo = NextLBR->From;
-          ++FallthroughLBRs[Trace(TraceFrom, TraceTo)].InternCount;
-        }
-        NextLBR = &LBR;
-      }
-      if (!Sample.LBR.empty()) {
-        HM.registerAddress(Sample.LBR.front().To);
-        HM.registerAddress(Sample.LBR.back().From);
-      }
-      NumTotalSamples += Sample.LBR.size();
-    }
-    outs() << "HEATMAP: read " << NumTotalSamples << " LBR samples\n";
-    outs() << "HEATMAP: " << FallthroughLBRs.size() << " unique traces\n";
-  }
 
   if (!NumTotalSamples) {
     if (opts::BasicAggregation) {
@@ -1363,6 +1336,8 @@
 
   outs() << "HEATMAP: building heat map...\n";
 
+  for (const auto &[PC, Hits] : BasicSamples)
+    HM.registerAddress(PC, Hits);
   for (const auto &LBR : FallthroughLBRs) {
     const Trace &Trace = LBR.first;
     const FTInfo &Info = LBR.second;
@@ -1382,10 +1357,12 @@
     HM.printCDF(opts::OutputFilename);
   else
     HM.printCDF(opts::OutputFilename + ".csv");
+  Heatmap::SectionStatsMap Stats = HM.computeSectionStats();
   if (opts::OutputFilename == "-")
-    HM.printSectionHotness(opts::OutputFilename);
+    HM.printSectionHotness(Stats, opts::OutputFilename);
   else
-    HM.printSectionHotness(opts::OutputFilename + "-section-hotness.csv");
+    HM.printSectionHotness(Stats,
+                           opts::OutputFilename + "-section-hotness.csv");
 
   return std::error_code();
 }
@@ -1412,7 +1389,10 @@
       const uint64_t TraceTo = NextLBR->From;
       const BinaryFunction *TraceBF =
           getBinaryFunctionContainingAddress(TraceFrom);
-      if (TraceBF && TraceBF->containsAddress(TraceTo)) {
+      if (opts::HeatmapMode) {
+        FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
+        ++Info.InternCount;
+      } else if (TraceBF && TraceBF->containsAddress(TraceTo)) {
         FTInfo &Info = FallthroughLBRs[Trace(TraceFrom, TraceTo)];
         if (TraceBF->containsAddress(LBR.From))
           ++Info.InternCount;
@@ -1446,6 +1426,11 @@
     }
     NextLBR = &LBR;
 
+    if (opts::HeatmapMode) {
+      TakenBranchInfo &Info = BranchLBRs[Trace(LBR.From, LBR.To)];
+      ++Info.TakenCount;
+      continue;
+    }
     uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0;
     uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0;
     if (!From && !To)
@@ -1454,6 +1439,10 @@
     ++Info.TakenCount;
     Info.MispredCount += LBR.Mispred;
   }
+  if (opts::HeatmapMode && !Sample.LBR.empty()) {
+    ++BasicSamples[Sample.LBR.front().To];
+    ++BasicSamples[Sample.LBR.back().From];
+  }
 }
 
 void DataAggregator::printColdSamplesDiagnostic() const {
@@ -1589,7 +1578,6 @@
       printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
     }
   }
-  printBranchSamplesDiagnostics();
 
   return std::error_code();
 }
@@ -1617,6 +1605,7 @@
     const TakenBranchInfo &Info = AggrLBR.second;
     doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
   }
+  printBranchSamplesDiagnostics();
 }
 
 std::error_code DataAggregator::parseBasicEvents() {
@@ -1630,6 +1619,7 @@
 
     if (!Sample->PC)
       continue;
+    ++NumTotalSamples;
 
     if (BinaryFunction *BF = getBinaryFunctionContainingAddress(Sample->PC))
       BF->setHasProfileAvailable();
@@ -1637,6 +1627,7 @@
     ++BasicSamples[Sample->PC];
     EventNames.insert(Sample->EventName);
   }
+  outs() << "PERF2BOLT: read " << NumTotalSamples << " basic samples\n";
 
   return std::error_code();
 }
@@ -1649,7 +1640,6 @@
   for (auto &Sample : BasicSamples) {
     const uint64_t PC = Sample.first;
     const uint64_t HitCount = Sample.second;
-    NumTotalSamples += HitCount;
     BinaryFunction *Func = getBinaryFunctionContainingAddress(PC);
     if (!Func) {
       OutOfRangeSamples += HitCount;
@@ -1658,7 +1648,6 @@
 
     doSample(*Func, PC, HitCount);
   }
-  outs() << "PERF2BOLT: read " << NumTotalSamples << " samples\n";
 
   printBasicSamplesDiagnostics(OutOfRangeSamples);
 }
@@ -1722,49 +1711,6 @@
   }
 }
 
-std::error_code DataAggregator::parsePreAggregatedLBRSamples() {
-  outs() << "PERF2BOLT: parsing pre-aggregated profile...\n";
-  NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events",
-                     TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
-  while (hasData())
-    if (std::error_code EC = parseAggregatedLBREntry())
-      return EC;
-
-  return std::error_code();
-}
-
-void DataAggregator::processPreAggregated() {
-  outs() << "PERF2BOLT: processing pre-aggregated profile...\n";
-  NamedRegionTimer T("processAggregated", "Processing aggregated branch events",
-                     TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
-
-  for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) {
-    switch (AggrEntry.EntryType) {
-    case AggregatedLBREntry::BRANCH:
-    case AggregatedLBREntry::TRACE:
-      doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
-               AggrEntry.Mispreds);
-      NumTotalSamples += AggrEntry.Count;
-      break;
-    case AggregatedLBREntry::FT:
-    case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {
-      LBREntry First{AggrEntry.EntryType == AggregatedLBREntry::FT
-                         ? AggrEntry.From.Offset
-                         : 0,
-                     AggrEntry.From.Offset, false};
-      LBREntry Second{AggrEntry.To.Offset, AggrEntry.To.Offset, false};
-      doTrace(First, Second, AggrEntry.Count);
-      NumTraces += AggrEntry.Count;
-      break;
-    }
-    }
-  }
-
-  outs() << "PERF2BOLT: read " << AggregatedLBRs.size()
-         << " aggregated LBR entries\n";
-  printBranchSamplesDiagnostics();
-}
-
 std::optional<int32_t> DataAggregator::parseCommExecEvent() {
   size_t LineEnd = ParsingBuf.find_first_of("\n");
   if (LineEnd == StringRef::npos) {
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index f2e999b..4a92c9e 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -128,6 +128,13 @@
   return Result;
 }
 
+uint64_t FuncSampleData::getSamples() const {
+  uint64_t Result = 0;
+  for (const SampleInfo &I : Data)
+    Result += I.Hits;
+  return Result;
+}
+
 void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) {
   auto Iter = Index.find(Offset);
   if (Iter == Index.end()) {
@@ -407,12 +414,12 @@
   FuncBranchData *FBD = getBranchData(BF);
   if (FBD) {
     BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
-    BF.RawBranchCount = FBD->getNumExecutedBranches();
+    BF.RawSampleCount = FBD->getNumExecutedBranches();
     if (BF.ProfileMatchRatio == 1.0f) {
       if (fetchProfileForOtherEntryPoints(BF)) {
         BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
         BF.ExecutionCount = FBD->ExecutionCount;
-        BF.RawBranchCount = FBD->getNumExecutedBranches();
+        BF.RawSampleCount = FBD->getNumExecutedBranches();
       }
       return;
     }
diff --git a/bolt/lib/Profile/Heatmap.cpp b/bolt/lib/Profile/Heatmap.cpp
index 5fc3e06..067c96c 100644
--- a/bolt/lib/Profile/Heatmap.cpp
+++ b/bolt/lib/Profile/Heatmap.cpp
@@ -284,64 +284,92 @@
   Counts.clear();
 }
 
-void Heatmap::printSectionHotness(StringRef FileName) const {
+void Heatmap::printSectionHotness(const Heatmap::SectionStatsMap &Stats,
+                                  StringRef FileName) const {
   std::error_code EC;
   raw_fd_ostream OS(FileName, EC, sys::fs::OpenFlags::OF_None);
   if (EC) {
     errs() << "error opening output file: " << EC.message() << '\n';
     exit(1);
   }
-  printSectionHotness(OS);
+  printSectionHotness(Stats, OS);
 }
 
-void Heatmap::printSectionHotness(raw_ostream &OS) const {
+StringMap<Heatmap::SectionStats> Heatmap::computeSectionStats() const {
   uint64_t NumTotalCounts = 0;
-  StringMap<uint64_t> SectionHotness;
+  StringMap<SectionStats> Stat;
   unsigned TextSectionIndex = 0;
 
   if (TextSections.empty())
-    return;
+    return Stat;
 
   uint64_t UnmappedHotness = 0;
   auto RecordUnmappedBucket = [&](uint64_t Address, uint64_t Frequency) {
-    errs() << "Couldn't map the address bucket [0x" << Twine::utohexstr(Address)
-           << ", 0x" << Twine::utohexstr(Address + BucketSize)
-           << "] containing " << Frequency
-           << " samples to a text section in the binary.";
+    if (opts::Verbosity >= 1)
+      errs() << "Couldn't map the address bucket ["
+             << formatv("{0:x}, {1:x}", Address, Address + BucketSize)
+             << "] containing " << Frequency
+             << " samples to a text section in the binary.\n";
     UnmappedHotness += Frequency;
   };
 
-  for (const std::pair<const uint64_t, uint64_t> &KV : Map) {
-    NumTotalCounts += KV.second;
+  for (const auto [Bucket, Count] : Map) {
+    NumTotalCounts += Count;
     // We map an address bucket to the first section (lowest address)
     // overlapping with that bucket.
-    auto Address = KV.first * BucketSize;
+    auto Address = Bucket * BucketSize;
     while (TextSectionIndex < TextSections.size() &&
            Address >= TextSections[TextSectionIndex].EndAddress)
       TextSectionIndex++;
     if (TextSectionIndex >= TextSections.size() ||
         Address + BucketSize < TextSections[TextSectionIndex].BeginAddress) {
-      RecordUnmappedBucket(Address, KV.second);
+      RecordUnmappedBucket(Address, Count);
       continue;
     }
-    SectionHotness[TextSections[TextSectionIndex].Name] += KV.second;
+    SectionStats &SecStats = Stat[TextSections[TextSectionIndex].Name];
+    ++SecStats.Buckets;
+    SecStats.Samples += Count;
   }
+  Stat["[total]"] = SectionStats{NumTotalCounts, Map.size()};
+  if (UnmappedHotness)
+    Stat["[unmapped]"] = SectionStats{UnmappedHotness, 0};
 
+  return Stat;
+}
+
+void Heatmap::printSectionHotness(const StringMap<SectionStats> &Stats,
+                                  raw_ostream &OS) const {
+  if (TextSections.empty())
+    return;
+
+  auto TotalIt = Stats.find("[total]");
+  assert(TotalIt != Stats.end() && "Malformed SectionStatsMap");
+  const uint64_t NumTotalCounts = TotalIt->second.Samples;
   assert(NumTotalCounts > 0 &&
          "total number of heatmap buckets should be greater than 0");
 
-  OS << "Section Name, Begin Address, End Address, Percentage Hotness\n";
-  for (auto &TextSection : TextSections) {
-    OS << TextSection.Name << ", 0x"
-       << Twine::utohexstr(TextSection.BeginAddress) << ", 0x"
-       << Twine::utohexstr(TextSection.EndAddress) << ", "
-       << format("%.4f",
-                 100.0 * SectionHotness[TextSection.Name] / NumTotalCounts)
-       << "\n";
+  OS << "Section Name, Begin Address, End Address, Percentage Hotness, "
+     << "Utilization Pct\n";
+  for (const auto [Name, Begin, End] : TextSections) {
+    uint64_t Samples = 0;
+    uint64_t Buckets = 0;
+    auto SectionIt = Stats.find(Name);
+    if (SectionIt != Stats.end()) {
+      Samples = SectionIt->second.Samples;
+      Buckets = SectionIt->second.Buckets;
+    }
+    const float RelHotness = 100. * Samples / NumTotalCounts;
+    const unsigned NumBuckets =
+        End / BucketSize + !!(End % BucketSize) - Begin / BucketSize;
+    const float BucketUtilization = 100. * Buckets / NumBuckets;
+    OS << formatv("{0}, {1:x}, {2:x}, {3:f4}, {4:f4}\n", Name, Begin, End,
+                  RelHotness, BucketUtilization);
   }
-  if (UnmappedHotness > 0)
-    OS << "[unmapped], 0x0, 0x0, "
-       << format("%.4f", 100.0 * UnmappedHotness / NumTotalCounts) << "\n";
+  auto UnmappedIt = Stats.find("[unmapped]");
+  if (UnmappedIt == Stats.end())
+    return;
+  const float UnmappedPct = 100. * UnmappedIt->second.Samples / NumTotalCounts;
+  OS << formatv("[unmapped], 0x0, 0x0, {0:f4}, 0\n", UnmappedPct);
 }
 } // namespace bolt
 } // namespace llvm
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index f5636bf..88b806c 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -177,11 +177,11 @@
 
   BF.setExecutionCount(YamlBF.ExecCount);
 
-  uint64_t FuncRawBranchCount = 0;
+  uint64_t FuncRawSampleCount = 0;
   for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks)
     for (const yaml::bolt::SuccessorInfo &YamlSI : YamlBB.Successors)
-      FuncRawBranchCount += YamlSI.Count;
-  BF.setRawBranchCount(FuncRawBranchCount);
+      FuncRawSampleCount += YamlSI.Count;
+  BF.setRawSampleCount(FuncRawSampleCount);
 
   if (BF.empty())
     return true;
diff --git a/bolt/test/X86/heatmap-preagg.test b/bolt/test/X86/heatmap-preagg.test
new file mode 100644
index 0000000..660d37f
--- /dev/null
+++ b/bolt/test/X86/heatmap-preagg.test
@@ -0,0 +1,35 @@
+## Test heatmap with pre-aggregated profile
+
+RUN: yaml2obj %p/Inputs/blarge_new.yaml &> %t.exe
+## Non-BOLTed input binary
+RUN: llvm-bolt-heatmap %t.exe -o %t --pa -p %p/Inputs/blarge_new.preagg.txt \
+RUN:   2>&1 | FileCheck --check-prefix CHECK-HEATMAP %s
+RUN: FileCheck %s --check-prefix CHECK-SEC-HOT --input-file %t-section-hotness.csv
+
+## BOLTed input binary
+RUN: llvm-bolt %t.exe -o %t.out --pa -p %p/Inputs/blarge_new.preagg.txt \
+RUN:   --reorder-blocks=ext-tsp --split-functions --split-strategy=cdsplit \
+RUN:   --reorder-functions=cdsort --enable-bat --dyno-stats --skip-funcs=main
+RUN: llvm-bolt-heatmap %t.out -o %t2 --pa -p %p/Inputs/blarge_new_bat.preagg.txt \
+RUN:   2>&1 | FileCheck --check-prefix CHECK-HEATMAP-BAT %s
+RUN: FileCheck %s --check-prefix CHECK-SEC-HOT-BAT --input-file %t2-section-hotness.csv
+
+CHECK-HEATMAP: PERF2BOLT: read 81 aggregated LBR entries
+CHECK-HEATMAP: HEATMAP: invalid traces: 1
+
+CHECK-SEC-HOT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct
+CHECK-SEC-HOT-NEXT: .init, 0x401000, 0x40101b, 16.8545, 100.0000
+CHECK-SEC-HOT-NEXT: .plt, 0x401020, 0x4010b0, 4.7583, 66.6667
+CHECK-SEC-HOT-NEXT: .text, 0x4010b0, 0x401c25, 78.3872, 85.1064
+CHECK-SEC-HOT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000
+
+CHECK-HEATMAP-BAT: PERF2BOLT: read 79 aggregated LBR entries
+CHECK-HEATMAP-BAT: HEATMAP: invalid traces: 2
+
+CHECK-SEC-HOT-BAT: Section Name, Begin Address, End Address, Percentage Hotness, Utilization Pct
+CHECK-SEC-HOT-BAT-NEXT: .init, 0x401000, 0x40101b, 17.2888, 100.0000
+CHECK-SEC-HOT-BAT-NEXT: .plt, 0x401020, 0x4010b0, 5.6132, 66.6667
+CHECK-SEC-HOT-BAT-NEXT: .bolt.org.text, 0x4010b0, 0x401c25, 38.3385
+CHECK-SEC-HOT-BAT-NEXT: .fini, 0x401c28, 0x401c35, 0.0000, 0.0000
+CHECK-SEC-HOT-BAT-NEXT: .text, 0x800000, 0x8002cc, 38.7595, 91.6667
+CHECK-SEC-HOT-BAT-NEXT: .text.cold, 0x800300, 0x800415, 0.0000, 0.0000
diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test
index 7bec442..44111de 100644
--- a/bolt/test/perf2bolt/perf_test.test
+++ b/bolt/test/perf2bolt/perf_test.test
@@ -8,6 +8,7 @@
 
 CHECK-NOT: PERF2BOLT-ERROR
 CHECK-NOT: !! WARNING !! This high mismatch ratio indicates the input binary is probably not the same binary used during profiling collection.
+CHECK: BOLT-INFO: Functions with density >= {{.*}} account for 99.00% total sample counts.
 
 RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4
 RUN: perf record -Fmax -e cycles:u -o %t5 -- %t4