| // NOTE: this test requires gpu-sm80 and cusparselt |
| // |
| // DEFINE: %{compile} = mlir-opt %s \ |
// DEFINE: --sparsifier="enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda \
// DEFINE: gpu-chip=sm_80 gpu-features=+ptx71 gpu-format=%gpu_compilation_format
| // DEFINE: %{run} = mlir-cpu-runner \ |
| // DEFINE: --shared-libs=%mlir_cuda_runtime \ |
| // DEFINE: --shared-libs=%mlir_c_runner_utils \ |
| // DEFINE: --e main --entry-point-result=void \ |
| // DEFINE: | FileCheck %s |
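//
// Note that %{compile} leaves the --sparsifier option string open:
// each RUN line below appends its enable-runtime-library setting
// together with the closing quote.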
| // |
| // with RT lib: |
| // |
| // RUN: %{compile} enable-runtime-library=true" | %{run} |
| // |
| // without RT lib: |
| // |
| // RUN: %{compile} enable-runtime-library=false" | %{run} |
| |
// The #NV_24 encoding expresses NVIDIA's 2:4 structured sparsity:
// levels i and (j floordiv 4) are dense, splitting each row into
// groups of four consecutive columns, while the innermost level
// (j mod 4) is structured[2, 4], i.e. at most two of every four
// entries in a group are stored.
#NV_24 = #sparse_tensor.encoding<{
| map = ( i, j ) -> |
| ( i : dense, |
| j floordiv 4 : dense, |
| j mod 4 : structured[2, 4] |
| ) |
| }> |
| |
| module { |
| |
  // Setup and teardown of the cuSPARSELt environment; these helpers
  // are implemented in the MLIR CUDA runtime support library
  // (%mlir_cuda_runtime).
  llvm.func @mgpuCreateSparseLtEnv()
  llvm.func @mgpuDestroySparseLtEnv()
| |
| func.func @matmul24(%Ad: tensor<16x16xf16>, |
| %B: tensor<16x16xf16>, |
| %Cin: tensor<16x16xf16>) -> tensor<16x16xf16> { |
    // Convert the dense %Ad into the 2:4 storage format before the matmul.
    %A = sparse_tensor.convert %Ad : tensor<16x16xf16> to tensor<16x16xf16, #NV_24>
| %C = linalg.matmul |
| ins(%A, %B: tensor<16x16xf16, #NV_24>, tensor<16x16xf16>) |
| outs(%Cin: tensor<16x16xf16>) -> tensor<16x16xf16> |
| return %C : tensor<16x16xf16> |
| } |
| |
| func.func @main() { |
| llvm.call @mgpuCreateSparseLtEnv() : () -> () |
| |
| %c0 = arith.constant 0 : index |
| %c1 = arith.constant 1 : index |
| %c16 = arith.constant 16 : index |
| |
| %f0 = arith.constant 0.0 : f16 |
| %f1 = arith.constant 1.0 : f16 |
| %f4 = arith.constant 4.0 : f16 |
| |
    // Initial A, B, C matrices. A alternates 4 and 1 along each row,
    // so every group of four consecutive entries reads (4, 1, 4, 1).
    %A = tensor.generate {
| ^bb0(%i: index, %j: index): |
| %val = arith.andi %j, %c1 : index |
| %cmp = arith.cmpi eq, %val, %c0 : index |
| %res = arith.select %cmp, %f4, %f1 : f16 |
| tensor.yield %res : f16 |
| } : tensor<16x16xf16> |
    // B is the 16x16 identity matrix.
    %B = tensor.generate {
| ^bb0(%i: index, %j: index): |
| %cmp = arith.cmpi eq, %i, %j : index |
| %res = arith.select %cmp, %f1, %f0 : f16 |
| tensor.yield %res : f16 |
| } : tensor<16x16xf16> |
    // C is the all-zero matrix.
    %C = tensor.generate {
| ^bb0(%i: index, %j: index): |
| tensor.yield %f0 : f16 |
| } : tensor<16x16xf16> |
| |
| // Call the kernel. |
| // |
    // Since B is the identity and C is zero, computing D = A B + C
    // effectively returns the pruned A to the caller.
| // |
| %D = call @matmul24(%A, %B, %C): (tensor<16x16xf16>, |
| tensor<16x16xf16>, |
| tensor<16x16xf16>) -> (tensor<16x16xf16>) |
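
    // The pruning step zeroes the two smaller entries in each group of
    // four, turning every (4, 1, 4, 1) group of A into (4, 0, 4, 0) in D,
    // as the CHECK lines below verify.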
| |
| // |
| // This was the original matrix. |
| // |
| // CHECK: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // CHECK-NEXT: ( 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1, 4, 1 ) |
| // |
| scf.for %i = %c0 to %c16 step %c1 { |
| %va = vector.transfer_read %A[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16> |
| vector.print %va : vector<16xf16> |
| } |
| |
| // |
    // This is the STRIP-pruned matrix, i.e. A after cuSPARSELt's 2:4
    // strip pruning.
| // |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // CHECK-NEXT: ( 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0, 4, 0 ) |
| // |
| scf.for %i = %c0 to %c16 step %c1 { |
| %vd = vector.transfer_read %D[%i, %c0], %f0 : tensor<16x16xf16>, vector<16xf16> |
| vector.print %vd : vector<16xf16> |
| } |
| |
| llvm.call @mgpuDestroySparseLtEnv() : () -> () |
| return |
| } |
| } |