Revert "[flang][cuda] Update stream type for cuf kernel op (#136627)"

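This reverts commit 46e734746db7176f6e32b3c98beacf1e94fced37.

The `stream` operand of `cuf.kernel` goes back to being an optional `i32`
value instead of a `!fir.ref` address: lowering evaluates the stream
expression as a value and converts it to `i32`, the assembly format no
longer prints the stream type, and the op verifier no longer calls
`checkStreamType`. The lowering test is updated to match.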
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index 46cc59c..926983d 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -254,19 +254,24 @@
     represented by a 0 constant value.
   }];
 
-  let arguments = (ins Variadic<I32>:$grid, // empty means `*`
-      Variadic<I32>:$block,                 // empty means `*`
-      Optional<fir_ReferenceType>:$stream, Variadic<Index>:$lowerbound,
-      Variadic<Index>:$upperbound, Variadic<Index>:$step,
-      OptionalAttr<I64Attr>:$n, Variadic<AnyType>:$reduceOperands,
-      OptionalAttr<ArrayAttr>:$reduceAttrs);
+  let arguments = (ins
+    Variadic<I32>:$grid, // empty means `*`
+    Variadic<I32>:$block, // empty means `*`
+    Optional<I32>:$stream,
+    Variadic<Index>:$lowerbound,
+    Variadic<Index>:$upperbound,
+    Variadic<Index>:$step,
+    OptionalAttr<I64Attr>:$n,
+    Variadic<AnyType>:$reduceOperands,
+    OptionalAttr<ArrayAttr>:$reduceAttrs
+  );
 
   let regions = (region AnyRegion:$region);
 
   let assemblyFormat = [{
     `<` `<` `<` custom<CUFKernelValues>($grid, type($grid)) `,` 
                 custom<CUFKernelValues>($block, type($block))
-        ( `,` `stream` `=` $stream^ `:` qualified(type($stream)))? `>` `>` `>`
+        ( `,` `stream` `=` $stream^ )? `>` `>` `>`
         ( `reduce` `(` $reduceOperands^ `:` type($reduceOperands) `:` $reduceAttrs `)` )?
         custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
             $upperbound, type($upperbound), $step, type($step))
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 7b76845..1652a86 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -3097,7 +3097,7 @@
 
     llvm::SmallVector<mlir::Value> gridValues;
     llvm::SmallVector<mlir::Value> blockValues;
-    mlir::Value streamAddr;
+    mlir::Value streamValue;
 
     if (launchConfig) {
       const std::list<Fortran::parser::CUFKernelDoConstruct::StarOrExpr> &grid =
@@ -3130,8 +3130,10 @@
       }
 
       if (stream)
-        streamAddr = fir::getBase(
-            genExprAddr(*Fortran::semantics::GetExpr(*stream), stmtCtx));
+        streamValue = builder->createConvert(
+            loc, builder->getI32Type(),
+            fir::getBase(
+                genExprValue(*Fortran::semantics::GetExpr(*stream), stmtCtx)));
     }
 
     const auto &outerDoConstruct =
@@ -3265,7 +3267,7 @@
     }
 
     auto op = builder->create<cuf::KernelOp>(
-        loc, gridValues, blockValues, streamAddr, lbs, ubs, steps, n,
+        loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n,
         mlir::ValueRange(reduceOperands), builder->getArrayAttr(reduceAttrs));
     builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes,
                          ivLocs);
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 24033bc..a86f12c 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -271,7 +271,7 @@
         return emitOpError("expect reduce attributes to be ReduceAttr");
     }
   }
-  return checkStreamType(*this);
+  return mlir::success();
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index 10f0b9e..0fceb29 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -75,7 +75,9 @@
   end do
 end
 
-! CHECK: cuf.kernel<<<*, *, stream = %[[STREAM]]#0 : !fir.ref<i64>>>>
+! CHECK: %[[STREAM_LOAD:.*]] = fir.load %[[STREAM]]#0 : !fir.ref<i64>
+! CHECK: %[[STREAM_I32:.*]] = fir.convert %[[STREAM_LOAD]] : (i64) -> i32
+! CHECK: cuf.kernel<<<*, *, stream = %[[STREAM_I32]]>>>
 
 
 ! Test lowering with unstructured construct inside.