[RFC][AMDGPU] Add AMDGPU_SUMMARY bitcode block for ThinLTO

With AMDGPU object linking, device functions are compiled separately from the
kernels that call them. Without whole-program visibility, the compiler must be
conservative about occupancy for every device function, leading to suboptimal
resource usage. However, GPU kernels typically carry explicit occupancy control
attributes that constrain the launch environment. ThinLTO is the natural place
to propagate these kernel attributes to callees: the combined module summary
index contains a cross-TU call graph, allowing occupancy information to be
propagated top-down from kernels to all reachable device functions. The backend
can then generate better code with the propagated constraints, achieving
whole-program awareness without the compile-time overhead of full LTO.

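This patch introduces a dedicated AMDGPU_SUMMARY bitcode block that serializes
per-function summary data alongside the standard module summary. Each entry
records the function GUID, whether the function is a kernel, and the bounds
taken from its amdgpu-flat-work-group-size, amdgpu-waves-per-eu, and
amdgpu-max-num-workgroups attributes. The block is scoped to AMDGPU (it is only
written for amdgcn AMDHSA modules) so that non-AMDGPU targets are completely
unaffected. A follow-up patch will add the ThinLTO propagation logic that reads
these summaries and applies conservative attribute bounds to device functions
reachable from multiple kernels.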
diff --git a/llvm/include/llvm/Bitcode/BitcodeReader.h b/llvm/include/llvm/Bitcode/BitcodeReader.h
index 772ca82..7ff6f7d 100644
--- a/llvm/include/llvm/Bitcode/BitcodeReader.h
+++ b/llvm/include/llvm/Bitcode/BitcodeReader.h
@@ -17,6 +17,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Bitstream/BitCodeEnums.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/AMDGPUSummary.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
@@ -166,6 +167,10 @@
     LLVM_ABI Error
     readSummary(ModuleSummaryIndex &CombinedIndex, StringRef ModulePath,
                 std::function<bool(GlobalValue::GUID)> IsPrevailing = nullptr);
+
+    /// Read the AMDGPU_SUMMARY block (if present) into \p Summaries, replacing
+    /// entries that share a GUID. Returns true if a block was read, else false.
+    LLVM_ABI Expected<bool> readAMDGPUSummary(AMDGPU::SummaryMap &Summaries);
   };
 
   struct BitcodeFileContents {
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index 9162754b..e543966 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -63,6 +63,8 @@
   SYMTAB_BLOCK_ID,
 
   SYNC_SCOPE_NAMES_BLOCK_ID,
+
+  AMDGPU_SUMMARY_BLOCK_ID,
 };
 
 /// Identification block contains a string that describes the producer details,
@@ -834,6 +836,14 @@
   SYMTAB_BLOB = 1,
 };
 
+enum AMDGPUSummaryCodes {
+  // [version] - the writer emits 1; the reader rejects any other value
+  AMDGPU_SUMMARY_VERSION = 1,
+  // [guid, is_entry, flat_wg_min, flat_wg_max,
+  //  waves_min, waves_max, max_wg_x, max_wg_y, max_wg_z]
+  AMDGPU_SUMMARY_ENTRY = 2,
+};
+
 } // End bitc namespace
 } // End llvm namespace
 
diff --git a/llvm/include/llvm/Support/AMDGPUSummary.h b/llvm/include/llvm/Support/AMDGPUSummary.h
new file mode 100644
index 0000000..8bade6d
--- /dev/null
+++ b/llvm/include/llvm/Support/AMDGPUSummary.h
@@ -0,0 +1,46 @@
+//===- AMDGPUSummary.h - AMDGPU ThinLTO summary data ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Per-function AMDGPU summary information carried through ThinLTO for
+// cross-TU attribute propagation. Stored in the AMDGPU_SUMMARY bitcode
+// block, separate from the standard module summary, so that non-AMDGPU
+// targets are completely unaffected.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_AMDGPUSUMMARY_H
+#define LLVM_SUPPORT_AMDGPUSUMMARY_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/GlobalValue.h"
+#include <cstdint>
+#include <limits>
+
+namespace llvm {
+namespace AMDGPU {
+
+struct FunctionSummary {
+  bool IsEntry = false; // Set by the writer for AMDGPU_KERNEL/SPIR_KERNEL.
+  // "amdgpu-flat-work-group-size" bounds; the defaults apply when unspecified.
+  uint32_t FlatWGSizeMin = 1;
+  uint32_t FlatWGSizeMax = 1024;
+  // "amdgpu-waves-per-eu" bounds; the defaults apply when unspecified.
+  uint32_t WavesPerEUMin = 1;
+  uint32_t WavesPerEUMax = 10; // TODO confirm: valid for every subtarget?
+  // "amdgpu-max-num-workgroups" per dimension; UINT32_MAX when unspecified.
+  uint32_t MaxNumWGX = std::numeric_limits<uint32_t>::max();
+  uint32_t MaxNumWGY = std::numeric_limits<uint32_t>::max();
+  uint32_t MaxNumWGZ = std::numeric_limits<uint32_t>::max();
+};
+// Keyed by function GUID. NOTE(review): confirm Support may include IR headers.
+using SummaryMap = DenseMap<GlobalValue::GUID, FunctionSummary>;
+
+} // namespace AMDGPU
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_AMDGPUSUMMARY_H
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index 911ec75..9a164e0 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -81,6 +81,8 @@
     return "STRTAB_BLOCK";
   case bitc::SYMTAB_BLOCK_ID:
     return "SYMTAB_BLOCK";
+  case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+    return "AMDGPU_SUMMARY_BLOCK";
   }
 }
 
@@ -420,6 +422,15 @@
     case bitc::SYMTAB_BLOB:
       return "BLOB";
     }
+  case bitc::AMDGPU_SUMMARY_BLOCK_ID:
+    switch (CodeID) {
+    default:
+      return std::nullopt;
+    case bitc::AMDGPU_SUMMARY_VERSION:
+      return "AMDGPU_SUMMARY_VERSION";
+    case bitc::AMDGPU_SUMMARY_ENTRY:
+      return "AMDGPU_SUMMARY_ENTRY";
+    }
   }
 #undef STRINGIFY_CODE
 }
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index fa7a3b2..ddae213 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -8695,6 +8695,88 @@
   return R.parseModule();
 }
 
+Expected<bool> BitcodeModule::readAMDGPUSummary(AMDGPU::SummaryMap &Summaries) {
+  BitstreamCursor Stream(Buffer);
+  if (Error JumpFailed = Stream.JumpToBit(ModuleBit))
+    return std::move(JumpFailed);
+
+  if (Error Err = Stream.EnterSubBlock(bitc::MODULE_BLOCK_ID))
+    return std::move(Err);
+
+  // Scan the module block for AMDGPU_SUMMARY, skipping everything else.
+  while (true) {
+    Expected<llvm::BitstreamEntry> MaybeEntry = Stream.advance();
+    if (!MaybeEntry)
+      return MaybeEntry.takeError();
+    BitstreamEntry Entry = MaybeEntry.get();
+
+    switch (Entry.Kind) {
+    case BitstreamEntry::Error:
+      return error("Malformed block");
+    case BitstreamEntry::EndBlock:
+      return false; // Module block ended without an AMDGPU_SUMMARY block.
+    case BitstreamEntry::Record:
+      if (Expected<unsigned> Skipped = Stream.skipRecord(Entry.ID); !Skipped)
+        return Skipped.takeError();
+      continue;
+    case BitstreamEntry::SubBlock:
+      if (Entry.ID == bitc::AMDGPU_SUMMARY_BLOCK_ID)
+        break;
+      if (Error Err = Stream.SkipBlock())
+        return std::move(Err);
+      continue;
+    }
+
+    // Found the AMDGPU_SUMMARY block; parse its records.
+    if (Error Err = Stream.EnterSubBlock(bitc::AMDGPU_SUMMARY_BLOCK_ID))
+      return std::move(Err);
+
+    SmallVector<uint64_t, 16> Record;
+    while (true) {
+      Expected<BitstreamEntry> MaybeRec = Stream.advanceSkippingSubblocks();
+      if (!MaybeRec)
+        return MaybeRec.takeError();
+      BitstreamEntry Rec = MaybeRec.get();
+
+      if (Rec.Kind == BitstreamEntry::EndBlock)
+        return true; // Any later AMDGPU_SUMMARY block is not read.
+      if (Rec.Kind != BitstreamEntry::Record)
+        return error("Expected record in AMDGPU_SUMMARY block");
+
+      Record.clear();
+      Expected<unsigned> MaybeCode = Stream.readRecord(Rec.ID, Record);
+      if (!MaybeCode)
+        return MaybeCode.takeError();
+      switch (MaybeCode.get()) {
+      case bitc::AMDGPU_SUMMARY_VERSION:
+        if (Record.size() < 1 || Record[0] != 1)
+          return error("Unsupported AMDGPU summary version");
+        break;
+      case bitc::AMDGPU_SUMMARY_ENTRY: {
+        if (Record.size() < 9)
+          return error("Invalid AMDGPU summary entry");
+        for (unsigned I = 2; I != 9; ++I) // Operands 2-8 narrow to uint32_t.
+          if (Record[I] > std::numeric_limits<uint32_t>::max())
+            return error("Invalid AMDGPU summary entry");
+        AMDGPU::FunctionSummary FS;
+        FS.IsEntry = Record[1] != 0;
+        FS.FlatWGSizeMin = Record[2];
+        FS.FlatWGSizeMax = Record[3];
+        FS.WavesPerEUMin = Record[4];
+        FS.WavesPerEUMax = Record[5];
+        FS.MaxNumWGX = Record[6];
+        FS.MaxNumWGY = Record[7];
+        FS.MaxNumWGZ = Record[8];
+        Summaries[Record[0]] = FS;
+        break;
+      }
+      default: // Ignore unknown record codes.
+        break;
+      }
+    }
+  }
+}
+
 // Parse the specified bitcode buffer, returning the function info index.
 Expected<std::unique_ptr<ModuleSummaryIndex>> BitcodeModule::getSummary() {
   BitstreamCursor Stream(Buffer);
@@ -8725,7 +8807,7 @@
     switch (Entry.Kind) {
     case BitstreamEntry::SubBlock: // Handled for us already.
     case BitstreamEntry::Error:
-      return error("Malformed block");
+      return error("malformed block");
     case BitstreamEntry::EndBlock: {
       // If no flags record found, return both flags as false.
       return std::make_pair(false, false);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index 7153b1a..84de4e7 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -64,6 +64,7 @@
 #include "llvm/Object/IRSymtab.h"
 #include "llvm/ProfileData/MemProf.h"
 #include "llvm/ProfileData/MemProfRadixTree.h"
+#include "llvm/Support/AMDGPUSummary.h"
 #include "llvm/Support/AtomicOrdering.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -450,6 +451,7 @@
                 DenseMap<const Function *, uint64_t> &FunctionToBitcodeIndex);
   void writeBlockInfo();
   void writeModuleHash(StringRef View);
+  void writeAMDGPUSummaryBlock();
 
   unsigned getEncodedSyncScopeID(SyncScope::ID SSID) {
     return unsigned(SSID);
@@ -5360,6 +5362,88 @@
   }
 }
 
+/// Emit the AMDGPU_SUMMARY block: a version record, then one entry per defined
+/// kernel or function with an occupancy attribute. Fields whose attribute is
+/// absent or fails to parse keep their FunctionSummary default values.
+void ModuleBitcodeWriter::writeAMDGPUSummaryBlock() {
+  const Triple &TT = M.getTargetTriple();
+  // Object linking is only supported on AMDHSA platforms.
+  if (TT.getArch() != Triple::amdgcn || TT.getOS() != Triple::AMDHSA)
+    return;
+
+  auto IsKernel = [](const Function &F) {
+    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
+           F.getCallingConv() == CallingConv::SPIR_KERNEL;
+  };
+
+  SmallVector<const Function *, 8> Worklist;
+  for (const Function &F : M) {
+    if (F.isDeclaration())
+      continue;
+    if (IsKernel(F) || F.hasFnAttribute("amdgpu-flat-work-group-size") ||
+        F.hasFnAttribute("amdgpu-waves-per-eu") ||
+        F.hasFnAttribute("amdgpu-max-num-workgroups"))
+      Worklist.push_back(&F);
+  }
+  if (Worklist.empty())
+    return;
+
+  Stream.EnterSubblock(bitc::AMDGPU_SUMMARY_BLOCK_ID, 4);
+  SmallVector<uint64_t, 10> Record = {1}; // Summary format version.
+  Stream.EmitRecord(bitc::AMDGPU_SUMMARY_VERSION, Record);
+
+  for (const Function *F : Worklist) {
+    AMDGPU::FunctionSummary FS;
+    FS.IsEntry = IsKernel(*F);
+
+    if (Attribute A = F->getFnAttribute("amdgpu-flat-work-group-size");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min, Max;
+      if (!MinS.trim().getAsInteger(0, Min) &&
+          !MaxS.trim().getAsInteger(0, Max)) {
+        FS.FlatWGSizeMin = Min;
+        FS.FlatWGSizeMax = Max;
+      }
+    }
+
+    if (Attribute A = F->getFnAttribute("amdgpu-waves-per-eu");
+        A.isStringAttribute()) {
+      auto [MinS, MaxS] = A.getValueAsString().split(',');
+      unsigned Min;
+      if (!MinS.trim().getAsInteger(0, Min)) {
+        FS.WavesPerEUMin = Min;
+        unsigned Max;
+        if (!MaxS.trim().empty() && !MaxS.trim().getAsInteger(0, Max))
+          FS.WavesPerEUMax = Max;
+      }
+    }
+
+    if (Attribute A = F->getFnAttribute("amdgpu-max-num-workgroups");
+        A.isStringAttribute()) {
+      SmallVector<StringRef, 3> Parts;
+      A.getValueAsString().split(Parts, ',');
+      if (Parts.size() == 3) {
+        unsigned X, Y, Z;
+        if (!Parts[0].trim().getAsInteger(0, X) &&
+            !Parts[1].trim().getAsInteger(0, Y) &&
+            !Parts[2].trim().getAsInteger(0, Z)) {
+          FS.MaxNumWGX = X;
+          FS.MaxNumWGY = Y;
+          FS.MaxNumWGZ = Z;
+        }
+      }
+    }
+
+    Record = {F->getGUID(),     FS.IsEntry,       FS.FlatWGSizeMin,
+              FS.FlatWGSizeMax, FS.WavesPerEUMin, FS.WavesPerEUMax,
+              FS.MaxNumWGX,     FS.MaxNumWGY,     FS.MaxNumWGZ};
+    Stream.EmitRecord(bitc::AMDGPU_SUMMARY_ENTRY, Record);
+  }
+
+  Stream.ExitBlock();
+}
+
 void ModuleBitcodeWriter::write() {
   writeIdentificationBlock(Stream);
 
@@ -5415,6 +5499,8 @@
   if (Index)
     writePerModuleGlobalValueSummary();
 
+  writeAMDGPUSummaryBlock();
+
   writeGlobalValueSymbolTable(FunctionToBitcodeIndex);
 
   writeModuleHash(Stream.getMarkedBufferAndResumeFlushing());
@@ -5613,7 +5699,7 @@
     Writer.writeSymtab();
     Writer.writeStrtab();
   };
-  Triple TT(M.getTargetTriple());
+  const Triple &TT = M.getTargetTriple();
   if (TT.isOSDarwin() || TT.isOSBinFormatMachO()) {
     // If this is darwin or another generic macho target, reserve space for the
     // header. Note that the header is computed *after* the output is known, so
diff --git a/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
new file mode 100644
index 0000000..f31a484
--- /dev/null
+++ b/llvm/test/ThinLTO/AMDGPU/amdgpu-summary-roundtrip.ll
@@ -0,0 +1,47 @@
+; Verify that the bitcode writer emits the expected AMDGPU_SUMMARY block.
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -module-summary %s -o %t.bc
+; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck %s --check-prefix=BLOCK
+
+; All attributes present.
+; BLOCK: <AMDGPU_SUMMARY_BLOCK
+; BLOCK-NEXT: <AMDGPU_SUMMARY_VERSION op0=1/>
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=64 op3=256 op4=2 op5=8 op6=16 op7=16 op8=1/>
+
+; Only flat-work-group-size — waves and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=128 op3=512 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Only waves-per-eu — flat-work-group-size and max-workgroups use defaults.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=4 op5=6 op6=4294967295 op7=4294967295 op8=4294967295/>
+
+; Bare kernel with no attributes — all defaults. @device_func gets no entry.
+; BLOCK-NEXT: <AMDGPU_SUMMARY_ENTRY {{.*}} op1=1 op2=1 op3=1024 op4=1 op5=10 op6=4294967295 op7=4294967295 op8=4294967295/>
+; BLOCK-NEXT: </AMDGPU_SUMMARY_BLOCK>
+
+define amdgpu_kernel void @kernel_all(ptr %p) #0 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_wg_only(ptr %p) #1 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_waves_only(ptr %p) #2 {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define amdgpu_kernel void @kernel_bare(ptr %p) {
+  call void @device_func(ptr %p)
+  ret void
+}
+
+define void @device_func(ptr %p) {
+  store i32 42, ptr %p
+  ret void
+}
+
+attributes #0 = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-waves-per-eu"="2,8" "amdgpu-max-num-workgroups"="16,16,1" }
+attributes #1 = { "amdgpu-flat-work-group-size"="128,512" }
+attributes #2 = { "amdgpu-waves-per-eu"="4,6" }