; blob: b24967d5a130ed1ef513018fae9ad35d32dec205 [file] [log] [blame]
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-SD-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-SD-DOT
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BASE,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-GI-DOT
; CHECK-GI-BASE: warning: Instruction selection used fallback path for full
; Add-reduce a <2 x i32> vector to a scalar i32.
; Every run configuration expects the same pairwise addp followed by fmov.
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %sum = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %sum
}
; Add-reduce a <4 x i16> vector to a scalar i16.
; Every run configuration expects a single across-lanes addv on 4h.
define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %sum = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %sum
}
; Add-reduce a <4 x i32> vector to a scalar i32 with no extension.
; Every run configuration expects a single addv on 4s.
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  ret i32 %sum
}
; Add-reduce a <8 x i8> vector to a scalar i8.
; Every run configuration expects a single addv on 8b.
define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %sum = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %sum
}
; Zero-extend <4 x i32> to <4 x i64>, then add-reduce to i64.
; The default selector (SD prefixes) folds this into one uaddlv; the
; GlobalISel runs (GI prefixes) expect ushll + uaddw2 + addp instead.
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <4 x i32> %x to <4 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %wide)
  ret i64 %sum
}
; Sign-extend <4 x i32> to <4 x i64>, then add-reduce to i64.
; The default selector (SD prefixes) folds this into one saddlv; the
; GlobalISel runs (GI prefixes) expect sshll + saddw2 + addp instead.
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <4 x i32> %x to <4 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %wide)
  ret i64 %sum
}
; Zero-extend <2 x i32> to <2 x i64>, then add-reduce to i64.
; Every run configuration expects ushll followed by a pairwise addp.
define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
  %wide = zext <2 x i32> %x to <2 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %wide)
  ret i64 %sum
}
; Sign-extend <2 x i32> to <2 x i64>, then add-reduce to i64.
; Every run configuration expects sshll followed by a pairwise addp.
define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
  %wide = sext <2 x i32> %x to <2 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %wide)
  ret i64 %sum
}
; Zero-extend <8 x i16> to <8 x i32>, then add-reduce to i32.
; The default selector (SD prefixes) expects one uaddlv; the GlobalISel
; runs (GI prefixes) expect ushll + uaddw2 + addv.
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <8 x i16> %x to <8 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %wide)
  ret i32 %sum
}
; Sign-extend <8 x i16> to <8 x i32>, then add-reduce to i32.
; The default selector (SD prefixes) expects one saddlv; the GlobalISel
; runs (GI prefixes) expect sshll + saddw2 + addv.
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <8 x i16> %x to <8 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %wide)
  ret i32 %sum
}
; Zero-extend <4 x i16> to <4 x i32>, then add-reduce to i32.
; Every run configuration expects ushll followed by addv.
define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %wide = zext <4 x i16> %x to <4 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide)
  ret i32 %sum
}
; Sign-extend <4 x i16> to <4 x i32>, then add-reduce to i32.
; Every run configuration expects sshll followed by addv.
define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %wide = sext <4 x i16> %x to <4 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide)
  ret i32 %sum
}
; Add-reduce <8 x i16> to an i16 returned with the zeroext attribute.
; Both selectors expect addv on 8h; the GlobalISel runs (GI prefixes)
; additionally expect an explicit uxth on the result.
define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i16:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i16:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i16:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i16:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
  %sum = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  ret i16 %sum
}
; Zero-extend <8 x i16> to <8 x i64> (two widening steps), then
; add-reduce to i64. The default selector (SD prefixes) expects
; ushll/ushll2 + uaddl/uaddl2; the GlobalISel runs (GI prefixes) expect
; four shift-extends feeding uaddw2.
define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <8 x i16> %x to <8 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %wide)
  ret i64 %sum
}
; Sign-extend <8 x i16> to <8 x i64> (two widening steps), then
; add-reduce to i64. The default selector (SD prefixes) expects
; sshll/sshll2 + saddl/saddl2; the GlobalISel runs (GI prefixes) expect
; four shift-extends feeding saddw2.
define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <8 x i16> %x to <8 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %wide)
  ret i64 %sum
}
; Zero-extend <4 x i16> to <4 x i64>, then add-reduce to i64.
; The default selector (SD prefixes) expects ushll + uaddlv; the
; GlobalISel runs (GI prefixes) expect ushll, ushll, uaddw2, addp.
define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <4 x i16> %x to <4 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %wide)
  ret i64 %sum
}
; Sign-extend <4 x i16> to <4 x i64>, then add-reduce to i64.
; The default selector (SD prefixes) expects sshll + saddlv; the
; GlobalISel runs (GI prefixes) expect sshll, sshll, saddw2, addp.
define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <4 x i16> %x to <4 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %wide)
  ret i64 %sum
}
; Zero-extend <2 x i16> to <2 x i64>, then add-reduce to i64.
; The default selector (SD prefixes) masks on 8b before the ushll; the
; GlobalISel runs (GI prefixes) do the ushll first and mask on 16b after.
define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <2 x i16> %x to <2 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %wide)
  ret i64 %sum
}
; Sign-extend <2 x i16> to <2 x i64>, then add-reduce to i64.
; Every run configuration expects ushll followed by a shl/sshr #48 pair
; and a pairwise addp.
define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
  %wide = sext <2 x i16> %x to <2 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %wide)
  ret i64 %sum
}
; Zero-extend <16 x i8> to <16 x i32>, then add-reduce to i32.
; With +dotprod both selectors share one expectation: udot against a
; vector of ones, then addv. Without it each selector has its own
; widening-add sequence.
define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
  %wide = zext <16 x i8> %x to <16 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %wide)
  ret i32 %sum
}
; Sign-extend <16 x i8> to <16 x i32>, then add-reduce to i32.
; With +dotprod both selectors share one expectation: sdot against a
; vector of ones, then addv. Without it each selector has its own
; widening-add sequence.
define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
  %wide = sext <16 x i8> %x to <16 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %wide)
  ret i32 %sum
}
; Zero-extend <8 x i8> to <8 x i32>, then add-reduce to i32.
; With +dotprod both selectors expect udot on 8b plus a pairwise addp,
; differing in register choice and movi order. Without it, SD expects
; ushll + uaddlv and GI expects ushll, ushll, uaddw2, addv.
define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <8 x i8> %x to <8 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %wide)
  ret i32 %sum
}
; Sign-extend <8 x i8> to <8 x i32>, then add-reduce to i32.
; With +dotprod both selectors expect sdot on 8b plus a pairwise addp,
; differing in register choice and movi order. Without it, SD expects
; sshll + saddlv and GI expects sshll, sshll, saddw2, addv.
define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <8 x i8> %x to <8 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %wide)
  ret i32 %sum
}
; Zero-extend <4 x i8> to <4 x i32>, then add-reduce to i32.
; The default selector (SD prefixes) clears the high byte with bic
; before ushll; the GlobalISel runs (GI prefixes) do ushll first and
; mask with a movi/and pair afterwards.
define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <4 x i8> %x to <4 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide)
  ret i32 %sum
}
; Sign-extend <4 x i8> to <4 x i32>, then add-reduce to i32.
; Every run configuration expects ushll followed by a shl/sshr #24 pair
; and addv.
define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #24
; CHECK-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
entry:
  %wide = sext <4 x i8> %x to <4 x i32>
  %sum = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide)
  ret i32 %sum
}
; Zero-extend <16 x i8> to <16 x i16>, then add-reduce to a zeroext i16.
; The default selector (SD prefixes) expects uaddlp + addv; the
; GlobalISel runs (GI prefixes) expect ushll + uaddw2 + addv and a
; trailing uxth.
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v0.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v0.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <16 x i8> %x to <16 x i16>
  %sum = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %wide)
  ret i16 %sum
}
; Sign-extend <16 x i8> to <16 x i16>, then add-reduce to a signext i16.
; The default selector (SD prefixes) expects saddlp + addv and reads the
; lane with smov; the GlobalISel runs (GI prefixes) expect sshll +
; saddw2 + addv, then fmov and sxth.
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v0.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v0.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <16 x i8> %x to <16 x i16>
  %sum = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %wide)
  ret i16 %sum
}
; Zero-extend <8 x i8> to <8 x i16>, then add-reduce to a zeroext i16.
; Both selectors expect ushll + addv; the GlobalISel runs (GI prefixes)
; additionally expect an explicit uxth on the result.
define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <8 x i8> %x to <8 x i16>
  %sum = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide)
  ret i16 %sum
}
; Sign-extend <8 x i8> to <8 x i16>, then add-reduce to a signext i16.
; Both selectors expect sshll + addv; the default selector (SD prefixes)
; reads the lane with smov, while the GlobalISel runs (GI prefixes)
; expect fmov followed by sxth.
define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = sext <8 x i8> %x to <8 x i16>
  %sum = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %wide)
  ret i16 %sum
}
; Add-reduce <16 x i8> to an i8 returned with the zeroext attribute.
; Both selectors expect addv on 16b; the GlobalISel runs (GI prefixes)
; additionally expect an explicit uxtb on the result.
define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv b0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv b0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv b0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: uxtb w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv b0, v0.16b
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: uxtb w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
  %sum = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  ret i8 %sum
}
; Zero-extend <16 x i8> to <16 x i64> (three widening steps), then
; add-reduce to i64. Neither selector's expectation changes with
; +dotprod. SD expects ushll/ushll2 feeding uaddl/uaddl2; GI expects a
; full ushll/ushll2 tree feeding uaddw2.
define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
  %wide = zext <16 x i8> %x to <16 x i64>
  %sum = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %wide)
  ret i64 %sum
}
; Sign-extends every i8 lane of a <16 x i8> to i64 and sums the sixteen lanes
; with llvm.vector.reduce.add. For a given instruction selector the expected
; code is the same with and without +dotprod: SelectionDAG combines the
; widened halves with saddl/saddl2, while GlobalISel widens to .2d with sshll
; and accumulates the upper halves with saddw2.
define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Zero-extends every i8 lane of a <8 x i8> to i64 and sums the eight lanes
; with llvm.vector.reduce.add. Enabling +dotprod does not change the expected
; code for either selector: SelectionDAG finishes the widening with
; uaddl/uaddl2, GlobalISel with ushll to .2d followed by uaddw2.
define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Signed counterpart of the <8 x i8> -> i64 reduction: each lane is
; sign-extended to i64 before llvm.vector.reduce.add. The expected code mirrors
; the unsigned variant with sshll/saddl/saddw2 in place of the unsigned forms,
; and is unaffected by +dotprod.
define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Zero-extends a <4 x i8> to <4 x i64> and reduces it with
; llvm.vector.reduce.add. The expected code operates on the argument as .4h
; lanes, so the zero-extension shows up as an explicit mask: SelectionDAG
; clears the upper byte of each halfword with bic and then uses uaddlv, while
; GlobalISel widens to .2d first and masks each lane with 0xff.
define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extends a <4 x i8> to <4 x i64> and reduces it with
; llvm.vector.reduce.add. Both selectors widen the .4h input to two .2d
; vectors and then sign-extend from bit 7 in-register using shl #56 followed by
; an arithmetic shift right by 56; the second shift is merged with the vector
; add as ssra. The selectors differ only in which half ends up in which
; register.
define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v1.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v1.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extends a <2 x i8> to <2 x i64> and reduces it with
; llvm.vector.reduce.add. The expected code operates on the argument as .2s
; lanes. SelectionDAG masks each 32-bit lane to its low byte before widening;
; GlobalISel widens to .2d first and then masks each 64-bit lane with 0xff.
define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x0, d0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x0, d0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extends a <2 x i8> to <2 x i64> and reduces it with
; llvm.vector.reduce.add. All four llc configurations are expected to emit the
; same code: widen the .2s lanes to .2d, sign-extend from bit 7 with a
; shl #56 / sshr #56 pair, then add the two lanes with addp.
define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Plain add reduction of a <2 x i64> with no extension involved. All four llc
; configurations are expected to emit a single pairwise addp followed by the
; move of the result into x0.
define i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
ret i64 %z
}
; Add reduction of a <4 x i32> whose result is then added to the scalar
; accumulator %a. All four llc configurations are expected to emit addv for the
; reduction and a separate scalar add for the accumulator.
define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends a <4 x i32> to <4 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to fold the extension and reduction into one uaddlv; GlobalISel is
; expected to use ushll + uaddw2 + addp. +dotprod changes neither.
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <4 x i32> to <4 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to fold the extension and reduction into one saddlv; GlobalISel is
; expected to use sshll + saddw2 + addp. +dotprod changes neither.
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <2 x i32> to <2 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. All four llc
; configurations are expected to emit ushll, addp and a scalar add.
define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <2 x i32> to <2 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. All four llc
; configurations are expected to emit sshll, addp and a scalar add.
define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <8 x i16> to <8 x i32>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to fold the extension and reduction into one uaddlv; GlobalISel is
; expected to use ushll + uaddw2 + addv. +dotprod changes neither.
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends a <8 x i16> to <8 x i32>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to fold the extension and reduction into one saddlv; GlobalISel is
; expected to use sshll + saddw2 + addv. +dotprod changes neither.
define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv s0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends a <4 x i16> to <4 x i32>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. All four llc
; configurations are expected to emit ushll, addv and a scalar add.
define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends a <4 x i16> to <4 x i32>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. All four llc
; configurations are expected to emit sshll, addv and a scalar add.
define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Add reduction of a <8 x i16> at its native element width, plus the scalar
; accumulator %a. The return value is declared zeroext, which is why every
; expected sequence ends by masking the sum with 0xffff. The selectors differ
; only in the scalar add: GlobalISel applies a uxth extend to the reduction
; result as part of the add, SelectionDAG uses a plain add.
define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xffff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i16_acc:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xffff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i16_acc:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extends a <8 x i16> to <8 x i64> (two widening steps), reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to perform the second widening inside uaddl/uaddl2; GlobalISel is
; expected to widen with ushll and accumulate with uaddw2. +dotprod changes
; neither.
define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <8 x i16> to <8 x i64> (two widening steps), reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to perform the second widening inside saddl/saddl2; GlobalISel is
; expected to widen with sshll and accumulate with saddw2. +dotprod changes
; neither.
define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <4 x i16> to <4 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to widen once to .4s and let uaddlv perform the remaining widening
; together with the reduction; GlobalISel is expected to use
; ushll + uaddw2 + addp after the first widening. +dotprod changes neither.
define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <4 x i16> to <4 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. SelectionDAG is
; expected to widen once to .4s and let saddlv perform the remaining widening
; together with the reduction; GlobalISel is expected to use
; sshll + saddw2 + addp after the first widening. +dotprod changes neither.
define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <2 x i16> to <2 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. The expected code
; operates on the argument as .2s lanes. SelectionDAG masks each 32-bit lane to
; its low 16 bits before widening; GlobalISel widens to .2d first and then
; masks each 64-bit lane with 0xffff.
define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x00ffff0000ffff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000000000ffff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <2 x i16> to <2 x i64>, reduces it with
; llvm.vector.reduce.add and adds the scalar accumulator %a. All four llc
; configurations are expected to emit the same code: widen the .2s lanes to
; .2d, sign-extend from bit 15 with a shl #48 / sshr #48 pair, addp, then the
; scalar add.
define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #48
; CHECK-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating reduction, zext <16 x i8> to <16 x i32>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Without +dotprod the expected code widens to halfwords, combines with
; widening adds (uaddl/uaddl2 or uaddw2) and reduces with addv.
; With +dotprod both instruction selectors share one set of assertions, a udot
; of the input against a vector of ones into a zeroed accumulator followed by
; addv.
define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w8, s0
; CHECK-DOT-NEXT: add w0, w8, w0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating reduction, sext <16 x i8> to <16 x i32>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Signed counterpart of the zext variant. Without +dotprod the expected code
; uses sshll with saddl/saddl2 or saddw2 and reduces with addv.
; With +dotprod both instruction selectors share one set of assertions, an sdot
; of the input against a vector of ones into a zeroed accumulator followed by
; addv.
define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v1.16b, #1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.4s, v0.16b, v1.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w8, s0
; CHECK-DOT-NEXT: add w0, w8, w0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v2.4s, v1.8h
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v3.4s, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating reduction, zext <8 x i8> to <8 x i32>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Without +dotprod the run without -global-isel expects ushll plus a single
; uaddlv, while the -global-isel run expects ushll, uaddw2 and addv.
; With +dotprod both selectors expect a 64-bit udot against a vector of ones
; followed by addp. The two assertion sets differ only in register assignment
; and in the order of the two movi instructions.
define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating reduction, sext <8 x i8> to <8 x i32>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Signed counterpart of the zext variant. Without +dotprod the run without
; -global-isel expects sshll plus saddlv, while the -global-isel run expects
; sshll, saddw2 and addv.
; With +dotprod both selectors expect a 64-bit sdot against a vector of ones
; followed by addp, differing only in register assignment and movi order.
define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v2.8b, #1
; CHECK-SD-DOT-NEXT: sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v1.4s, v0.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.8b, #1
; CHECK-GI-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating reduction, zext <4 x i8> to <4 x i32>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; No dot-product instruction is expected in any configuration, so the
; assertions with and without +dotprod are the same for each selector.
; The runs without -global-isel clear the upper byte of each halfword with bic
; before widening. The -global-isel runs widen first and then mask each word
; with 0xff.
define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w0, w8, w0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w0, w8, w0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w0, w8, w0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x0000ff000000ff
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w0, w8, w0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating reduction, sext <4 x i8> to <4 x i32>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; All four RUN configurations share one set of assertions. The lanes are
; widened with ushll and then sign-extended in place by a shl/sshr pair of 24
; bits before the addv reduction and the scalar add.
define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-NEXT: shl v0.4s, v0.4s, #24
; CHECK-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w8, s0
; CHECK-NEXT: add w0, w8, w0
; CHECK-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating reduction, zext <16 x i8> to <16 x i16>, summed with
; llvm.vector.reduce.add and then added to the scalar %a. The i16 result is
; returned zeroext, so every variant ends by masking w0 with 0xffff.
; The runs without -global-isel expect a single uaddlv over the bytes. The
; -global-isel runs expect ushll, uaddw2 and addv, and fold a uxth extension of
; the reduced value into the scalar add.
; Assertions with and without +dotprod are the same for each selector.
define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlv h0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xffff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlv h0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xffff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating reduction, sext <16 x i8> to <16 x i16>, summed with
; llvm.vector.reduce.add and then added to the scalar %a. The i16 result is
; returned signext, so every variant ends with sxth into w0.
; The runs without -global-isel expect a single saddlv over the bytes. The
; -global-isel runs expect sshll, saddw2 and addv, and fold a uxth extension of
; the reduced value into the scalar add.
; Assertions with and without +dotprod are the same for each selector.
define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlv h0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: sxth w0, w8
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlv h0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: sxth w0, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.8h, v1.8h, v0.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating reduction, zext <8 x i8> to <8 x i16>, summed with
; llvm.vector.reduce.add and then added to the scalar %a. The i16 result is
; returned zeroext, so every variant ends by masking w0 with 0xffff.
; All configurations expect ushll followed by addv on halfwords. The only
; difference is the scalar add, which is a plain register add without
; -global-isel and an add with a uxth-extended operand with -global-isel.
define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xffff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xffff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating reduction, sext <8 x i8> to <8 x i16>, summed with
; llvm.vector.reduce.add and then added to the scalar %a. The i16 result is
; returned signext, so every variant ends with sxth into w0.
; All configurations expect sshll followed by addv on halfwords. The only
; difference is the scalar add, which is a plain register add without
; -global-isel and an add with a uxth-extended operand with -global-isel.
define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: sxth w0, w8
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: sxth w0, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating reduction with no extension. The <16 x i8> input is summed
; directly with llvm.vector.reduce.add and then added to the scalar %a. The i8
; result is returned zeroext, so every variant ends by masking w0 with 0xff.
; All configurations expect a single addv over the bytes. The only difference
; is the scalar add, which is a plain register add without -global-isel and an
; add with a uxtb-extended operand with -global-isel.
define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: addv b0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w8, s0
; CHECK-SD-BASE-NEXT: add w8, w8, w0
; CHECK-SD-BASE-NEXT: and w0, w8, #0xff
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: addv b0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w8, s0
; CHECK-SD-DOT-NEXT: add w8, w8, w0
; CHECK-SD-DOT-NEXT: and w0, w8, #0xff
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv b0, v0.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxtb
; CHECK-GI-BASE-NEXT: and w0, w8, #0xff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv b0, v0.16b
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxtb
; CHECK-GI-DOT-NEXT: and w0, w8, #0xff
; CHECK-GI-DOT-NEXT: ret
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%r = add i8 %z, %a
ret i8 %r
}
; Accumulating reduction, zext <16 x i8> to <16 x i64>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; No dot-product instruction is expected in any configuration, so the
; assertions with and without +dotprod are the same for each selector.
; The runs without -global-isel widen to words with ushll/ushll2 and combine
; with uaddl/uaddl2. The -global-isel runs widen one step further with ushll
; and combine with uaddw2. Both finish with vector adds, addp and a scalar add.
define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: ushll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating reduction, sext <16 x i8> to <16 x i64>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Signed counterpart of the zext variant. No dot-product instruction is
; expected in any configuration.
; The runs without -global-isel widen to words with sshll/sshll2 and combine
; with saddl/saddl2. The -global-isel runs widen one step further with sshll
; and combine with saddw2. Both finish with vector adds, addp and a scalar add.
define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v1.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v1.2d, v5.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v1.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: sshll v2.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v5.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v7.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v1.2d, v2.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating reduction, zext <8 x i8> to <8 x i64>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; No dot-product instruction is expected in any configuration, so the
; assertions with and without +dotprod are the same for each selector.
; The runs without -global-isel widen to words and combine with uaddl/uaddl2.
; The -global-isel runs widen to doublewords with ushll and combine with
; uaddw2. Both finish with a vector add, addp and a scalar add.
define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating reduction, sext <8 x i8> to <8 x i64>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Signed counterpart of the zext variant. No dot-product instruction is
; expected in any configuration.
; The runs without -global-isel widen to words and combine with saddl/saddl2.
; The -global-isel runs widen to doublewords with sshll and combine with
; saddw2. Both finish with a vector add, addp and a scalar add.
define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v1.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v2.2d, v1.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v3.2d, v0.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v1.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating reduction, zext <4 x i8> to <4 x i64>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Assertions with and without +dotprod are the same for each selector.
; The runs without -global-isel clear the upper byte of each halfword with bic,
; widen to words and reduce with a single uaddlv into a doubleword.
; The -global-isel runs widen all the way to doublewords, mask each lane with
; 0xff, then use a vector add and addp.
define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlv d0, v0.4s
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlv d0, v0.4s
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: and v2.16b, v2.16b, v1.16b
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating reduction, sext <4 x i8> to <4 x i64>, summed with
; llvm.vector.reduce.add and then added to the scalar %a.
; Every configuration widens to doublewords with ushll/ushll2, shifts both
; halves left by 56, sign-extends one half with sshr and folds the other half
; in with ssra (shift right and accumulate), then reduces with addp.
; The two selectors differ only in which half lands in which register.
; Assertions with and without +dotprod are the same for each selector.
define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: ssra v1.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends <2 x i8> %x to <2 x i64>, sums the two lanes with
; llvm.vector.reduce.add and adds the scalar accumulator %a to the result.
; Expected output: the default-selector (SD) runs mask each 32-bit lane down to
; its low byte and then widen (and + ushll); the -global-isel (GI) runs widen
; first and mask the 64-bit lanes afterwards. Both end with addp plus a scalar
; add of x0.
define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-SD-BASE-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x8, d0
; CHECK-SD-BASE-NEXT: add x0, x8, x0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d1, #0x0000ff000000ff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x8, d0
; CHECK-SD-DOT-NEXT: add x0, x8, x0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: add x0, x8, x0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v1.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: add x0, x8, x0
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends <2 x i8> %x to <2 x i64>, sums the two lanes with
; llvm.vector.reduce.add and adds the scalar accumulator %a to the result.
; All four RUN configurations share one expected sequence: widen the lanes,
; sign-extend in-register from 8 bits via shl/sshr by 56, then addp and a
; scalar add of x0.
define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-NEXT: shl v0.2d, v0.2d, #56
; CHECK-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sums the two lanes of <2 x i64> %x with llvm.vector.reduce.add (no
; extension involved) and adds the scalar accumulator %a to the result.
; All four RUN configurations share one expected sequence: addp, move to a
; general-purpose register, scalar add of x0.
define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: add x0, x8, x0
; CHECK-NEXT: ret
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%r = add i64 %z, %a
ret i64 %r
}
; Reduces two <4 x i32> inputs independently with llvm.vector.reduce.add and
; adds the two scalar sums.
; Expected output: the default-selector (SD) runs combine the pair into one
; vector add followed by a single addv; the -global-isel (GI) runs keep two
; separate addv reductions and add the results in general-purpose registers.
define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i32:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i32:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i32:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i32:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
%z = add i32 %z1, %z2
ret i32 %z
}
; Zero-extends two <4 x i32> inputs to <4 x i64>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs use a widening pairwise add
; of %y (uaddlp) and accumulate %x into it (uadalp) before one addp; the
; -global-isel (GI) runs widen each input separately (ushll + uaddw2), reduce
; each with addp and add the scalars.
define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sign-extends two <4 x i32> inputs to <4 x i64>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs use a signed widening
; pairwise add of %y (saddlp) and accumulate %x into it (sadalp) before one
; addp; the -global-isel (GI) runs widen each input separately
; (sshll + saddw2), reduce each with addp and add the scalars.
define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i32_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i32> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Zero-extends two <2 x i32> inputs to <2 x i64>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs fold the pair into one
; widening add (uaddl) followed by a single addp; the -global-isel (GI) runs
; widen each input with ushll, reduce each with addp and add the scalars.
define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sign-extends two <2 x i32> inputs to <2 x i64>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs fold the pair into one
; signed widening add (saddl) followed by a single addp; the -global-isel (GI)
; runs widen each input with sshll, reduce each with addp and add the scalars.
define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i32_v2i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i32> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Zero-extends two <8 x i16> inputs to <8 x i32>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs use a widening pairwise add
; of %y (uaddlp) and accumulate %x into it (uadalp) before one addv; the
; -global-isel (GI) runs widen each input separately (ushll + uaddw2), reduce
; each with addv and add the scalars.
define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-DOT-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-DOT-NEXT: addv s0, v1.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-DOT-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sign-extends two <8 x i16> inputs to <8 x i32>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs use a signed widening
; pairwise add of %y (saddlp) and accumulate %x into it (sadalp) before one
; addv; the -global-isel (GI) runs widen each input separately
; (sshll + saddw2), reduce each with addv and add the scalars.
define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-DOT-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-DOT-NEXT: addv s0, v1.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-DOT-NEXT: saddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i16> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Zero-extends two <4 x i16> inputs to <4 x i32>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs fold the pair into one
; widening add (uaddl) followed by a single addv; the -global-isel (GI) runs
; widen each input with ushll, reduce each with addv and add the scalars.
define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sign-extends two <4 x i16> inputs to <4 x i32>, reduces each with
; llvm.vector.reduce.add and adds the two scalar sums.
; Expected output: the default-selector (SD) runs fold the pair into one
; signed widening add (saddl) followed by a single addv; the -global-isel (GI)
; runs widen each input with sshll, reduce each with addv and add the scalars.
define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i16> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Unsigned dot-product pattern on 8 bytes: both <8 x i8> inputs are
; zero-extended to <8 x i32>, multiplied lane-wise (nuw nsw) and the products
; summed with llvm.vector.reduce.add.
; Expected output: without +dotprod the bytes are widened to 16 bits and
; multiplied with umull/umlal2 before addv; with +dotprod a single udot into a
; zeroed accumulator is followed by addp. Both instruction selectors share the
; same expected sequence in each case.
define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-BASE-LABEL: test_udot_v8i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: addv s0, v2.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_udot_v8i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
entry:
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on 16 bytes: both <16 x i8> inputs are
; zero-extended to <16 x i32>, multiplied lane-wise (nuw nsw) and the products
; summed with llvm.vector.reduce.add.
; Expected output: with +dotprod both instruction selectors share one sequence
; (udot into a zeroed accumulator, then addv). Without +dotprod both selectors
; widen to 16 bits and use umull/umlal-style multiplies, but the instruction
; order and register use differ, so each has its own expected sequence.
define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_udot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: umlal v4.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_udot_v16i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: udot v2.4s, v1.16b, v0.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_udot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: umull v4.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: umull v5.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: umlal2 v4.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: umlal2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
%0 = zext <16 x i8> %a to <16 x i32>
%1 = zext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Unsigned dot-product pattern on a non-power-of-two width: two <24 x i8>
; vectors are loaded from %p1 and %p2, zero-extended to <24 x i32>, multiplied
; lane-wise (nuw nsw) and the products summed with llvm.vector.reduce.add.
; Expected output:
;  - Without +dotprod, both selectors share one sequence: a 16-byte and an
;    8-byte load per pointer, widening multiplies, then add + addv.
;  - With +dotprod, the default-selector (SD) runs split the work into one
;    16-byte udot and one 8-byte udot and add the two reduced scalars.
;  - With +dotprod, the -global-isel (GI) runs load every byte individually,
;    assemble vectors lane by lane, fill the upper half of the trailing 8-byte
;    chunk from the zeroed register v0, and issue two 16-byte udot
;    instructions before add + addv.
define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_udot_v24i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldr q0, [x0]
; CHECK-BASE-NEXT: ldr q1, [x1]
; CHECK-BASE-NEXT: ldr d4, [x0, #16]
; CHECK-BASE-NEXT: ldr d5, [x1, #16]
; CHECK-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-BASE-NEXT: umull v6.4s, v3.4h, v2.4h
; CHECK-BASE-NEXT: umull2 v2.4s, v3.8h, v2.8h
; CHECK-BASE-NEXT: ushll v3.8h, v4.8b, #0
; CHECK-BASE-NEXT: ushll v4.8h, v5.8b, #0
; CHECK-BASE-NEXT: umlal2 v2.4s, v4.8h, v3.8h
; CHECK-BASE-NEXT: umlal v6.4s, v4.4h, v3.4h
; CHECK-BASE-NEXT: umlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: umlal v6.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: add v0.4s, v6.4s, v2.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: udot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: udot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #8]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #1]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #17]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b19, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #2]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #10]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #10]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #18]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #12]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #14]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v19.b[0]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.d[1], v6.d[0]
; CHECK-GI-DOT-NEXT: mov v1.d[1], v3.d[0]
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v2.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: mov v5.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: udot v3.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: udot v0.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
%0 = zext <24 x i8> %a to <24 x i32>
%1 = zext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_udot_v48i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldp q0, q4, [x1]
; CHECK-BASE-NEXT: ldr q2, [x0, #32]
; CHECK-BASE-NEXT: ldp q1, q3, [x0]
; CHECK-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-BASE-NEXT: ushll2 v16.8h, v2.16b, #0
; CHECK-BASE-NEXT: ushll2 v6.8h, v0.16b, #0
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll2 v17.8h, v7.16b, #0
; CHECK-BASE-NEXT: ushll2 v5.8h, v1.16b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: umull2 v18.4s, v6.8h, v5.8h
; CHECK-BASE-NEXT: umull v19.4s, v0.4h, v1.4h
; CHECK-BASE-NEXT: umull v5.4s, v6.4h, v5.4h
; CHECK-BASE-NEXT: umull2 v0.4s, v0.8h, v1.8h
; CHECK-BASE-NEXT: ushll v1.8h, v2.8b, #0
; CHECK-BASE-NEXT: ushll v2.8h, v7.8b, #0
; CHECK-BASE-NEXT: ushll2 v6.8h, v3.16b, #0
; CHECK-BASE-NEXT: ushll2 v7.8h, v4.16b, #0
; CHECK-BASE-NEXT: umlal2 v18.4s, v17.8h, v16.8h
; CHECK-BASE-NEXT: umlal v5.4s, v17.4h, v16.4h
; CHECK-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: ushll v1.8h, v3.8b, #0
; CHECK-BASE-NEXT: ushll v2.8h, v4.8b, #0
; CHECK-BASE-NEXT: umlal2 v18.4s, v7.8h, v6.8h
; CHECK-BASE-NEXT: umlal v5.4s, v7.4h, v6.4h
; CHECK-BASE-NEXT: umlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: umlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: add v1.4s, v19.4s, v5.4s
; CHECK-BASE-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: udot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: udot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b6, [x0, #17]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v6.b[0]
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #32]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #33]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #32]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #33]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #2]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #34]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #34]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #35]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #35]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #36]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #4]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #36]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #5]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #21]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #37]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #37]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #6]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #38]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #38]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #7]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #23]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #39]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #39]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #8]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #24]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #8]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #24]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #40]
; CHECK-GI-DOT-NEXT: mov v2.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #40]
; CHECK-GI-DOT-NEXT: mov v4.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: mov v5.b[8], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #25]
; CHECK-GI-DOT-NEXT: mov v3.b[8], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #25]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #41]
; CHECK-GI-DOT-NEXT: mov v2.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #41]
; CHECK-GI-DOT-NEXT: mov v4.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #10]
; CHECK-GI-DOT-NEXT: mov v5.b[9], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #26]
; CHECK-GI-DOT-NEXT: mov v3.b[9], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #10]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #26]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #42]
; CHECK-GI-DOT-NEXT: mov v2.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #42]
; CHECK-GI-DOT-NEXT: mov v4.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[10], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #27]
; CHECK-GI-DOT-NEXT: mov v3.b[10], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #11]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #27]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #43]
; CHECK-GI-DOT-NEXT: mov v2.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #43]
; CHECK-GI-DOT-NEXT: mov v4.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #12]
; CHECK-GI-DOT-NEXT: mov v5.b[11], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #28]
; CHECK-GI-DOT-NEXT: mov v3.b[11], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #28]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #44]
; CHECK-GI-DOT-NEXT: mov v2.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #44]
; CHECK-GI-DOT-NEXT: mov v4.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: mov v5.b[12], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #29]
; CHECK-GI-DOT-NEXT: mov v3.b[12], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #29]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #45]
; CHECK-GI-DOT-NEXT: mov v2.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #45]
; CHECK-GI-DOT-NEXT: mov v4.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #14]
; CHECK-GI-DOT-NEXT: mov v5.b[13], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #30]
; CHECK-GI-DOT-NEXT: mov v3.b[13], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #30]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #46]
; CHECK-GI-DOT-NEXT: mov v2.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #46]
; CHECK-GI-DOT-NEXT: mov v4.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: mov v5.b[14], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #31]
; CHECK-GI-DOT-NEXT: mov v3.b[14], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #31]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #47]
; CHECK-GI-DOT-NEXT: mov v2.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #47]
; CHECK-GI-DOT-NEXT: mov v4.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v5.b[15], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v3.b[15], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v0.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: udot v7.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v16.4s, v6.16b, v3.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v7.4s
; CHECK-GI-DOT-NEXT: addv s2, v16.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w0, w8, w10
; CHECK-GI-DOT-NEXT: ret
entry:
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
%0 = zext <48 x i8> %a to <48 x i32>
%1 = zext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <8 x i8> register arguments: both
; inputs are sign-extended to i32, multiplied lane-wise, and summed with
; llvm.vector.reduce.add.
; Runs without +dotprod are expected to widen with sshll, multiply with
; smull/smlal2 and reduce with addv. Runs with +dotprod are expected to emit a
; single 64-bit sdot into a zeroed accumulator followed by addp.
; NOTE(review): the multiply carries `nuw` although its operands are
; sign-extended; per the LangRef that makes the product poison for most inputs
; with a negative lane. The flags look carried over from the zext (udot)
; variants of this test - confirm this is intentional.
define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
; CHECK-BASE-LABEL: test_sdot_v8i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: smull v2.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: addv s0, v2.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_sdot_v8i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.2s, v1.8b, v0.8b
; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
entry:
; Widen both operands with sign extension before multiplying.
%0 = sext <8 x i8> %a to <8 x i32>
%1 = sext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
; Horizontal add of all eight products.
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <16 x i8> register arguments
; (sext to i32, lane-wise multiply, llvm.vector.reduce.add).
; Without +dotprod, SelectionDAG and GlobalISel have separate expectations:
; both use sshll/sshll2 widening and smull/smlal multiply-accumulates, but they
; pair the partial products differently before the final add + addv.
; With +dotprod both selectors share one expectation: a single 128-bit sdot
; into a zeroed accumulator followed by addv.
; NOTE(review): `nuw` on a product of sign-extended operands yields poison for
; most inputs with a negative lane (LangRef); the flags look carried over from
; the zext (udot) variants - confirm this is intentional.
define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
; CHECK-SD-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-SD-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-SD-BASE-NEXT: smlal v4.4s, v1.4h, v0.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v4.4s, v2.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: test_sdot_v16i8:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: sdot v2.4s, v1.16b, v0.16b
; CHECK-DOT-NEXT: addv s0, v2.4s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: smull v4.4s, v3.4h, v2.4h
; CHECK-GI-BASE-NEXT: smull v5.4s, v1.4h, v0.4h
; CHECK-GI-BASE-NEXT: smlal2 v4.4s, v3.8h, v2.8h
; CHECK-GI-BASE-NEXT: smlal2 v5.4s, v1.8h, v0.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v5.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
entry:
; Widen both operands with sign extension before multiplying.
%0 = sext <16 x i8> %a to <16 x i32>
%1 = sext <16 x i8> %b to <16 x i32>
%2 = mul nuw nsw <16 x i32> %1, %0
; Horizontal add of all sixteen products.
%3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <24 x i8> vectors loaded from %p1 and
; %p2 (sext to i32, lane-wise multiply, llvm.vector.reduce.add). 24 lanes is
; not a single 64-bit or 128-bit NEON vector, so the input has to be split.
; Expectations:
;  - without +dotprod (both selectors): one q load plus one d load per pointer,
;    sshll/smull/smlal widening multiply-accumulate, then add + addv;
;  - SelectionDAG with +dotprod: a 128-bit sdot for bytes 0-15 and a 64-bit
;    sdot for bytes 16-23, each reduced separately and summed in w registers;
;  - GlobalISel with +dotprod: the vectors are assembled one byte at a time
;    (ldr b + lane mov), the 8-byte tail is padded with the zeroed v0 through
;    the d[1] lane moves, and two 128-bit sdots are added before a single addv.
; NOTE(review): `nuw` on a product of sign-extended operands yields poison for
; most inputs with a negative lane (LangRef); the flags look carried over from
; the zext (udot) variants - confirm this is intentional.
define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_sdot_v24i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldr q0, [x0]
; CHECK-BASE-NEXT: ldr q1, [x1]
; CHECK-BASE-NEXT: ldr d4, [x0, #16]
; CHECK-BASE-NEXT: ldr d5, [x1, #16]
; CHECK-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-BASE-NEXT: smull v6.4s, v3.4h, v2.4h
; CHECK-BASE-NEXT: smull2 v2.4s, v3.8h, v2.8h
; CHECK-BASE-NEXT: sshll v3.8h, v4.8b, #0
; CHECK-BASE-NEXT: sshll v4.8h, v5.8b, #0
; CHECK-BASE-NEXT: smlal2 v2.4s, v4.8h, v3.8h
; CHECK-BASE-NEXT: smlal v6.4s, v4.4h, v3.4h
; CHECK-BASE-NEXT: smlal2 v2.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: smlal v6.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: add v0.4s, v6.4s, v2.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q2, [x0]
; CHECK-SD-DOT-NEXT: ldr q3, [x1]
; CHECK-SD-DOT-NEXT: ldr d4, [x0, #16]
; CHECK-SD-DOT-NEXT: ldr d5, [x1, #16]
; CHECK-SD-DOT-NEXT: sdot v1.2s, v5.8b, v4.8b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v3.16b, v2.16b
; CHECK-SD-DOT-NEXT: addp v1.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w8, s1
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: add w0, w9, w8
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #8]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #1]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #8]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #17]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b19, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #2]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #10]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #10]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #18]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #12]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #14]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v19.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b19, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v19.b[0]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v4.d[1], v6.d[0]
; CHECK-GI-DOT-NEXT: mov v1.d[1], v3.d[0]
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v2.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: mov v5.d[1], v0.d[0]
; CHECK-GI-DOT-NEXT: sdot v3.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: sdot v0.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.4s, v3.4s, v0.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Load both 24-byte operands from memory.
%a = load <24 x i8>, ptr %p1
%b = load <24 x i8>, ptr %p2
; Widen with sign extension, multiply lane-wise, then sum all 24 products.
%0 = sext <24 x i8> %a to <24 x i32>
%1 = sext <24 x i8> %b to <24 x i32>
%2 = mul nuw nsw <24 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
ret i32 %3
}
; Signed dot-product reduction over two <48 x i8> vectors loaded from %p1 and
; %p2 (sext to i32, lane-wise multiply, llvm.vector.reduce.add). 48 bytes is
; three 128-bit NEON vectors per operand.
; Expectations:
;  - without +dotprod (both selectors): ldp/ldr q loads, sshll/sshll2 widening,
;    smull/smlal multiply-accumulate into four accumulators, an add tree, addv;
;  - SelectionDAG with +dotprod: three sdots chained into one accumulator and a
;    single addv;
;  - GlobalISel with +dotprod: the vectors are assembled one byte at a time
;    (ldr b + lane mov), three sdots use three separate zeroed accumulators,
;    and each is reduced with addv before the results are summed in w registers.
; NOTE(review): `nuw` on a product of sign-extended operands yields poison for
; most inputs with a negative lane (LangRef); the flags look carried over from
; the zext (udot) variants - confirm this is intentional.
define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
; CHECK-BASE-LABEL: test_sdot_v48i8:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldp q0, q4, [x1]
; CHECK-BASE-NEXT: ldr q2, [x0, #32]
; CHECK-BASE-NEXT: ldp q1, q3, [x0]
; CHECK-BASE-NEXT: ldr q7, [x1, #32]
; CHECK-BASE-NEXT: sshll2 v16.8h, v2.16b, #0
; CHECK-BASE-NEXT: sshll2 v6.8h, v0.16b, #0
; CHECK-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: sshll2 v17.8h, v7.16b, #0
; CHECK-BASE-NEXT: sshll2 v5.8h, v1.16b, #0
; CHECK-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: smull2 v18.4s, v6.8h, v5.8h
; CHECK-BASE-NEXT: smull v19.4s, v0.4h, v1.4h
; CHECK-BASE-NEXT: smull v5.4s, v6.4h, v5.4h
; CHECK-BASE-NEXT: smull2 v0.4s, v0.8h, v1.8h
; CHECK-BASE-NEXT: sshll v1.8h, v2.8b, #0
; CHECK-BASE-NEXT: sshll v2.8h, v7.8b, #0
; CHECK-BASE-NEXT: sshll2 v6.8h, v3.16b, #0
; CHECK-BASE-NEXT: sshll2 v7.8h, v4.16b, #0
; CHECK-BASE-NEXT: smlal2 v18.4s, v17.8h, v16.8h
; CHECK-BASE-NEXT: smlal v5.4s, v17.4h, v16.4h
; CHECK-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: sshll v1.8h, v3.8b, #0
; CHECK-BASE-NEXT: sshll v2.8h, v4.8b, #0
; CHECK-BASE-NEXT: smlal2 v18.4s, v7.8h, v6.8h
; CHECK-BASE-NEXT: smlal v5.4s, v7.4h, v6.4h
; CHECK-BASE-NEXT: smlal v19.4s, v2.4h, v1.4h
; CHECK-BASE-NEXT: smlal2 v0.4s, v2.8h, v1.8h
; CHECK-BASE-NEXT: add v1.4s, v19.4s, v5.4s
; CHECK-BASE-NEXT: add v0.4s, v0.4s, v18.4s
; CHECK-BASE-NEXT: add v0.4s, v1.4s, v0.4s
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: ldr q1, [x0, #32]
; CHECK-SD-DOT-NEXT: ldr q2, [x1, #32]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: ldp q3, q1, [x0]
; CHECK-SD-DOT-NEXT: ldp q4, q2, [x1]
; CHECK-SD-DOT-NEXT: sdot v0.4s, v4.16b, v3.16b
; CHECK-SD-DOT-NEXT: sdot v0.4s, v2.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ldr b1, [x0]
; CHECK-GI-DOT-NEXT: ldr b5, [x0, #1]
; CHECK-GI-DOT-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: ldr b2, [x0, #16]
; CHECK-GI-DOT-NEXT: ldr b6, [x0, #17]
; CHECK-GI-DOT-NEXT: ldr b4, [x1]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #1]
; CHECK-GI-DOT-NEXT: mov v1.b[1], v5.b[0]
; CHECK-GI-DOT-NEXT: ldr b5, [x1, #16]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #17]
; CHECK-GI-DOT-NEXT: mov v2.b[1], v6.b[0]
; CHECK-GI-DOT-NEXT: ldr b3, [x0, #32]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #33]
; CHECK-GI-DOT-NEXT: mov v4.b[1], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b6, [x1, #32]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #33]
; CHECK-GI-DOT-NEXT: mov v5.b[1], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #2]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #18]
; CHECK-GI-DOT-NEXT: mov v3.b[1], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #2]
; CHECK-GI-DOT-NEXT: mov v6.b[1], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #18]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #34]
; CHECK-GI-DOT-NEXT: mov v2.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x1, #34]
; CHECK-GI-DOT-NEXT: mov v4.b[2], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #3]
; CHECK-GI-DOT-NEXT: mov v5.b[2], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #19]
; CHECK-GI-DOT-NEXT: mov v3.b[2], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #19]
; CHECK-GI-DOT-NEXT: mov v6.b[2], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #3]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #35]
; CHECK-GI-DOT-NEXT: mov v2.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #35]
; CHECK-GI-DOT-NEXT: mov v4.b[3], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #4]
; CHECK-GI-DOT-NEXT: mov v5.b[3], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #20]
; CHECK-GI-DOT-NEXT: mov v3.b[3], v18.b[0]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #36]
; CHECK-GI-DOT-NEXT: mov v6.b[3], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #4]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #20]
; CHECK-GI-DOT-NEXT: mov v2.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #36]
; CHECK-GI-DOT-NEXT: mov v4.b[4], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #5]
; CHECK-GI-DOT-NEXT: mov v5.b[4], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #21]
; CHECK-GI-DOT-NEXT: mov v3.b[4], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[4], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #5]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #21]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #37]
; CHECK-GI-DOT-NEXT: mov v2.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #37]
; CHECK-GI-DOT-NEXT: mov v4.b[5], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #6]
; CHECK-GI-DOT-NEXT: mov v5.b[5], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #22]
; CHECK-GI-DOT-NEXT: mov v3.b[5], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[5], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #6]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #22]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #38]
; CHECK-GI-DOT-NEXT: mov v2.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #38]
; CHECK-GI-DOT-NEXT: mov v4.b[6], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #7]
; CHECK-GI-DOT-NEXT: mov v5.b[6], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #23]
; CHECK-GI-DOT-NEXT: mov v3.b[6], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[6], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #7]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #23]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #39]
; CHECK-GI-DOT-NEXT: mov v2.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #39]
; CHECK-GI-DOT-NEXT: mov v4.b[7], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #8]
; CHECK-GI-DOT-NEXT: mov v5.b[7], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #24]
; CHECK-GI-DOT-NEXT: mov v3.b[7], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[7], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #8]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #24]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #40]
; CHECK-GI-DOT-NEXT: mov v2.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #40]
; CHECK-GI-DOT-NEXT: mov v4.b[8], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #9]
; CHECK-GI-DOT-NEXT: mov v5.b[8], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #25]
; CHECK-GI-DOT-NEXT: mov v3.b[8], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[8], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #9]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #25]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #41]
; CHECK-GI-DOT-NEXT: mov v2.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #41]
; CHECK-GI-DOT-NEXT: mov v4.b[9], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #10]
; CHECK-GI-DOT-NEXT: mov v5.b[9], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #26]
; CHECK-GI-DOT-NEXT: mov v3.b[9], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[9], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #10]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #26]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #42]
; CHECK-GI-DOT-NEXT: mov v2.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #42]
; CHECK-GI-DOT-NEXT: mov v4.b[10], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #11]
; CHECK-GI-DOT-NEXT: mov v5.b[10], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #27]
; CHECK-GI-DOT-NEXT: mov v3.b[10], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[10], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #11]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #27]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #43]
; CHECK-GI-DOT-NEXT: mov v2.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #43]
; CHECK-GI-DOT-NEXT: mov v4.b[11], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #12]
; CHECK-GI-DOT-NEXT: mov v5.b[11], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #28]
; CHECK-GI-DOT-NEXT: mov v3.b[11], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[11], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #12]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #28]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #44]
; CHECK-GI-DOT-NEXT: mov v2.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #44]
; CHECK-GI-DOT-NEXT: mov v4.b[12], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #13]
; CHECK-GI-DOT-NEXT: mov v5.b[12], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #29]
; CHECK-GI-DOT-NEXT: mov v3.b[12], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[12], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #13]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #29]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #45]
; CHECK-GI-DOT-NEXT: mov v2.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #45]
; CHECK-GI-DOT-NEXT: mov v4.b[13], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #14]
; CHECK-GI-DOT-NEXT: mov v5.b[13], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x0, #30]
; CHECK-GI-DOT-NEXT: mov v3.b[13], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[13], v16.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #14]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #30]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #46]
; CHECK-GI-DOT-NEXT: mov v2.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #46]
; CHECK-GI-DOT-NEXT: mov v4.b[14], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x0, #15]
; CHECK-GI-DOT-NEXT: mov v5.b[14], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x0, #31]
; CHECK-GI-DOT-NEXT: mov v3.b[14], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[14], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v1.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: ldr b7, [x1, #15]
; CHECK-GI-DOT-NEXT: ldr b17, [x1, #31]
; CHECK-GI-DOT-NEXT: ldr b18, [x0, #47]
; CHECK-GI-DOT-NEXT: mov v2.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: ldr b16, [x1, #47]
; CHECK-GI-DOT-NEXT: mov v4.b[15], v7.b[0]
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: mov v5.b[15], v17.b[0]
; CHECK-GI-DOT-NEXT: mov v3.b[15], v18.b[0]
; CHECK-GI-DOT-NEXT: mov v6.b[15], v16.b[0]
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v0.4s, v4.16b, v1.16b
; CHECK-GI-DOT-NEXT: sdot v7.4s, v5.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v16.4s, v6.16b, v3.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v7.4s
; CHECK-GI-DOT-NEXT: addv s2, v16.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w0, w8, w10
; CHECK-GI-DOT-NEXT: ret
entry:
; Load both 48-byte operands from memory.
%a = load <48 x i8>, ptr %p1
%b = load <48 x i8>, ptr %p2
; Widen with sign extension, multiply lane-wise, then sum all 48 products.
%0 = sext <48 x i8> %a to <48 x i32>
%1 = sext <48 x i8> %b to <48 x i32>
%2 = mul nuw nsw <48 x i32> %1, %0
%3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
ret i32 %3
}
; Test to ensure that if G_MUL has more than 1 use, it should not be combined to UDOT
; Here the widened product %2 feeds both the vector reduction and an
; extractelement of lane 0, so the multiply result itself must stay available.
; The GlobalISel +dotprod expectation is therefore the same ushll/umull/umlal2
; sequence as the runs without +dotprod (no udot). The SelectionDAG +dotprod
; expectation still forms udot for the reduction and recomputes the low lanes
; with a separate ushll/umull for the extracted element.
define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
; CHECK-BASE-LABEL: test_udot_v8i8_multi_use:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-BASE-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-BASE-NEXT: mov v3.16b, v2.16b
; CHECK-BASE-NEXT: fmov w8, s2
; CHECK-BASE-NEXT: umlal2 v3.4s, v1.8h, v0.8h
; CHECK-BASE-NEXT: addv s0, v3.4s
; CHECK-BASE-NEXT: fmov w9, s0
; CHECK-BASE-NEXT: add w0, w9, w8
; CHECK-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v0.8b
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: umull v0.4s, v1.4h, v0.4h
; CHECK-SD-DOT-NEXT: addp v2.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w9, s0
; CHECK-SD-DOT-NEXT: fmov w8, s2
; CHECK-SD-DOT-NEXT: add w0, w8, w9
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-DOT-LABEL: test_udot_v8i8_multi_use:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: umull v2.4s, v1.4h, v0.4h
; CHECK-GI-DOT-NEXT: mov v3.16b, v2.16b
; CHECK-GI-DOT-NEXT: fmov w8, s2
; CHECK-GI-DOT-NEXT: umlal2 v3.4s, v1.8h, v0.8h
; CHECK-GI-DOT-NEXT: addv s0, v3.4s
; CHECK-GI-DOT-NEXT: fmov w9, s0
; CHECK-GI-DOT-NEXT: add w0, w9, w8
; CHECK-GI-DOT-NEXT: ret
entry:
; Unsigned widening multiply of the two byte vectors.
%0 = zext <8 x i8> %a to <8 x i32>
%1 = zext <8 x i8> %b to <8 x i32>
%2 = mul nuw nsw <8 x i32> %1, %0
; First use of %2: horizontal sum of all eight products.
%3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
; Second use of %2: lane 0 of the product, added on top of the sum.
%4 = extractelement <8 x i32> %2, i32 0
%5 = add nuw nsw i32 %3, %4
ret i32 %5
}
; Sum of two independent llvm.vector.reduce.add calls over <8 x i16> inputs,
; returned as a zero-extended i16. dotprod makes no difference here: the two
; SelectionDAG expectations are identical to each other, as are the two
; GlobalISel expectations.
;  - SelectionDAG: the two reductions are merged into one vector add followed
;    by a single addv;
;  - GlobalISel: each input is reduced with its own addv, the scalars are added
;    (with a uxth extend on one operand) and the result is masked to 16 bits.
define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i16:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i16:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.8h, v0.8h, v1.8h
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i16:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i16:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
; Reduce each input separately, then add the two scalar sums.
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two llvm.vector.reduce.add calls, each over an <8 x i16> input that is
; zero-extended to <8 x i64> first. dotprod makes no difference here: the two
; SelectionDAG expectations are identical to each other, as are the two
; GlobalISel expectations.
;  - SelectionDAG: ushll/ushll2 to 32 bits, uaddl/uaddl2 to 64 bits, the two
;    inputs combined with vector adds and a single addp at the end;
;  - GlobalISel: ushll/ushll2 widening plus uaddw2, each input reduced with its
;    own addp, and the two results added in x registers.
define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Zero-extend and reduce %x.
%xx = zext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; Zero-extend and reduce %y.
%yy = zext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
; Combine the two scalar sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two llvm.vector.reduce.add calls, each over an <8 x i16> input that is
; sign-extended to <8 x i64> first. dotprod makes no difference here: the two
; SelectionDAG expectations are identical to each other, as are the two
; GlobalISel expectations.
;  - SelectionDAG: sshll/sshll2 to 32 bits, saddl/saddl2 to 64 bits, the two
;    inputs combined with vector adds and a single addp at the end;
;  - GlobalISel: sshll/sshll2 widening plus saddw2, each input reduced with its
;    own addp, and the two results added in x registers.
define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Sign-extend and reduce %x.
%xx = sext <8 x i16> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
; Sign-extend and reduce %y.
%yy = sext <8 x i16> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
; Combine the two scalar sums.
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions: each <4 x i16> argument is zero-extended to
; <4 x i64>, reduced with llvm.vector.reduce.add, and the two i64 results are
; added.
; The expected output is the same with and without +dotprod. Without
; -global-isel the second input is folded in with uadalp (pairwise widening
; accumulate) so only one addp is needed; with -global-isel each input is
; reduced separately and the scalars are added in general-purpose registers.
define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Signed counterpart of the <4 x i16> -> <4 x i64> pair test: each argument is
; sign-extended to <4 x i64>, reduced with llvm.vector.reduce.add, and the two
; i64 results are added.
; The expected output is the same with and without +dotprod. Without
; -global-isel the second input is folded in with sadalp so only one addp is
; needed; with -global-isel each input is reduced separately and the scalars
; are added in general-purpose registers.
define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: saddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: sadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i16_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll v2.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v3.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v2.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v3.2d, v1.4s
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i16> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions on <2 x i16> arguments: each is zero-extended to
; <2 x i64>, reduced with llvm.vector.reduce.add, and the two i64 results are
; added.
; In the expected output the zero-extension appears as an AND with a mask that
; keeps the low 16 bits of each lane. The output is the same with and without
; +dotprod. Without -global-isel the two masked inputs are combined by uaddl
; before a single addp; with -global-isel each input is reduced separately and
; the scalars are added in general-purpose registers.
define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-BASE-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x0000000000ffff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions on <2 x i16> arguments: each is sign-extended to
; <2 x i64>, reduced with llvm.vector.reduce.add, and the two i64 results are
; added.
; In the expected output the sign-extension appears as a left shift by 48
; followed by an arithmetic right shift by 48. The output is the same with and
; without +dotprod. Without -global-isel the second right shift is fused with
; the accumulation (ssra) and a single addp follows; with -global-isel both
; inputs use sshr, are reduced separately, and the scalars are added in
; general-purpose registers.
define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #48
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #48
; CHECK-SD-BASE-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-SD-BASE-NEXT: ssra v0.2d, v1.2d, #48
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #48
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #48
; CHECK-SD-DOT-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-SD-DOT-NEXT: ssra v0.2d, v1.2d, #48
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #48
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #48
; CHECK-GI-BASE-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #48
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i16_v2i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #48
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #48
; CHECK-GI-DOT-NEXT: sshr v0.2d, v0.2d, #48
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #48
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i16> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Sum of two add-reductions: each <16 x i8> argument is zero-extended to
; <16 x i32>, reduced with llvm.vector.reduce.add, and the two i32 results are
; added.
; This test has different expected output with +dotprod: the widening chain is
; replaced by udot against a vector of ones. Without -global-isel both udot
; instructions accumulate into the same register and one addv follows; with
; -global-isel each input gets its own zeroed accumulator and its own addv,
; and the scalars are added in general-purpose registers.
define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: uaddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: uaddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: udot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v5.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v3.4s, v6.4s, v3.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v7.4s, v1.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v3.4s, v1.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: udot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = zext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Signed counterpart of the <16 x i8> -> <16 x i32> pair test: each argument
; is sign-extended to <16 x i32>, reduced with llvm.vector.reduce.add, and the
; two i32 results are added.
; With +dotprod the expected output uses sdot against a vector of ones instead
; of the widening chain. Without -global-isel both sdot instructions
; accumulate into the same register and one addv follows; with -global-isel
; each input gets its own zeroed accumulator and its own addv, and the scalars
; are added in general-purpose registers.
define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.4s, v0.8h, v2.8h
; CHECK-SD-BASE-NEXT: saddl v0.4s, v0.4h, v2.4h
; CHECK-SD-BASE-NEXT: saddl2 v2.4s, v1.8h, v3.8h
; CHECK-SD-BASE-NEXT: saddl v1.4s, v1.4h, v3.4h
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-SD-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.16b, #1
; CHECK-SD-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-SD-DOT-NEXT: sdot v3.4s, v0.16b, v2.16b
; CHECK-SD-DOT-NEXT: addv s0, v3.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.4s, v4.4s, v2.8h
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v5.4s, v0.8h
; CHECK-GI-BASE-NEXT: saddw2 v3.4s, v6.4s, v3.8h
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v7.4s, v1.8h
; CHECK-GI-BASE-NEXT: add v0.4s, v2.4s, v0.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v3.4s, v1.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.16b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.4s, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: sdot v3.4s, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v4.4s
; CHECK-GI-DOT-NEXT: addv s1, v3.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%yy = sext <16 x i8> %y to <16 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions: each <8 x i8> argument is zero-extended to
; <8 x i32>, reduced with llvm.vector.reduce.add, and the two i32 results are
; added.
; Without +dotprod and without -global-isel the expected output widens once
; and folds the second input in with uadalp before one addv. With +dotprod the
; 64-bit form of udot (2s accumulator, 8b sources) against a vector of ones is
; expected, finished by addp. With -global-isel each input is reduced
; separately in both configurations and the scalars are added in
; general-purpose registers.
define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: udot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: udot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = zext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Signed counterpart of the <8 x i8> -> <8 x i32> pair test: each argument is
; sign-extended to <8 x i32>, reduced with llvm.vector.reduce.add, and the two
; i32 results are added.
; Without +dotprod and without -global-isel the expected output widens once
; and folds the second input in with sadalp before one addv. With +dotprod the
; 64-bit form of sdot (2s accumulator, 8b sources) against a vector of ones is
; expected, finished by addp. With -global-isel each input is reduced
; separately in both configurations and the scalars are added in
; general-purpose registers.
define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: saddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: sadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: addv s0, v1.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v3.8b, #1
; CHECK-SD-DOT-NEXT: sdot v2.2s, v1.8b, v3.8b
; CHECK-SD-DOT-NEXT: sdot v2.2s, v0.8b, v3.8b
; CHECK-SD-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.4s, v2.4s, v0.8h
; CHECK-GI-BASE-NEXT: saddw2 v1.4s, v3.4s, v1.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.8b, #1
; CHECK-GI-DOT-NEXT: movi v3.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: sdot v4.2s, v0.8b, v2.8b
; CHECK-GI-DOT-NEXT: sdot v3.2s, v1.8b, v2.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v4.2s, v4.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v3.2s, v3.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%yy = sext <8 x i8> %y to <8 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions on <4 x i8> arguments: each is zero-extended to
; <4 x i32>, reduced with llvm.vector.reduce.add, and the two i32 results are
; added.
; The expected output is the same with and without +dotprod (no udot is
; expected here). Without -global-isel the high byte of every 16-bit lane is
; cleared with bic, the inputs are combined by uaddl, and one addv follows;
; with -global-isel each input is widened, masked to 8 bits per lane, reduced
; separately, and the scalars are added in general-purpose registers.
define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x0000ff000000ff
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = zext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions on <4 x i8> arguments: each is sign-extended to
; <4 x i32>, reduced with llvm.vector.reduce.add, and the two i32 results are
; added.
; In the expected output the sign-extension appears as a left shift by 24
; followed by an arithmetic right shift by 24. The output is the same with and
; without +dotprod. Without -global-isel the second right shift is fused with
; the accumulation (ssra) and one addv follows; with -global-isel both inputs
; use sshr, are reduced separately, and the scalars are added in
; general-purpose registers.
define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-BASE-NEXT: shl v1.4s, v1.4s, #24
; CHECK-SD-BASE-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-BASE-NEXT: ssra v0.4s, v1.4s, #24
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: shl v0.4s, v0.4s, #24
; CHECK-SD-DOT-NEXT: shl v1.4s, v1.4s, #24
; CHECK-SD-DOT-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-SD-DOT-NEXT: ssra v0.4s, v1.4s, #24
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: shl v0.4s, v0.4s, #24
; CHECK-GI-BASE-NEXT: shl v1.4s, v1.4s, #24
; CHECK-GI-BASE-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-GI-BASE-NEXT: sshr v1.4s, v1.4s, #24
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i32_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: shl v0.4s, v0.4s, #24
; CHECK-GI-DOT-NEXT: shl v1.4s, v1.4s, #24
; CHECK-GI-DOT-NEXT: sshr v0.4s, v0.4s, #24
; CHECK-GI-DOT-NEXT: sshr v1.4s, v1.4s, #24
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: addv s1, v1.4s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%yy = sext <4 x i8> %y to <4 x i32>
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
%z = add i32 %z1, %z2
ret i32 %z
}
; Sum of two add-reductions with a zeroext i16 return: each <16 x i8> argument
; is zero-extended to <16 x i16>, reduced with llvm.vector.reduce.add, and the
; two i16 results are added.
; The expected output is the same with and without +dotprod. Without
; -global-isel the inputs are combined with uaddlp/uadalp and one addv follows;
; with -global-isel each input is reduced separately, the scalars are added
; with a uxth-extended operand, and the result is masked to 16 bits with
; and #0xffff.
define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-BASE-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v1.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.8h, v1.16b
; CHECK-SD-DOT-NEXT: uadalp v1.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v1.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-BASE-NEXT: uaddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: uaddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-DOT-NEXT: uaddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = zext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions with a signext i16 return: each <16 x i8> argument
; is sign-extended to <16 x i16>, reduced with llvm.vector.reduce.add, and the
; two i16 results are added.
; The expected output is the same with and without +dotprod. Without
; -global-isel the inputs are combined with saddlp/sadalp, one addv follows,
; and smov produces the sign-extended return value; with -global-isel each
; input is reduced separately, the scalars are added, and sxth sign-extends
; the result.
define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddlp v1.8h, v1.16b
; CHECK-SD-BASE-NEXT: sadalp v1.8h, v0.16b
; CHECK-SD-BASE-NEXT: addv h0, v1.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddlp v1.8h, v1.16b
; CHECK-SD-DOT-NEXT: sadalp v1.8h, v0.16b
; CHECK-SD-DOT-NEXT: addv h0, v1.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: saddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-BASE-NEXT: saddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: saddw2 v0.8h, v2.8h, v0.16b
; CHECK-GI-DOT-NEXT: saddw2 v1.8h, v3.8h, v1.16b
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%yy = sext <16 x i8> %y to <16 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions with a zeroext i16 return: each <8 x i8> argument
; is zero-extended to <8 x i16>, reduced with llvm.vector.reduce.add, and the
; two i16 results are added.
; The expected output is the same with and without +dotprod. Without
; -global-isel a single uaddl both widens and combines the inputs before one
; addv; with -global-isel each input is widened and reduced separately, the
; scalars are added, and the result is masked to 16 bits with and #0xffff.
define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: and w0, w8, #0xffff
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = zext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions with a signext i16 return: each <8 x i8> argument
; is sign-extended to <8 x i16>, reduced with llvm.vector.reduce.add, and the
; two i16 results are added.
; The expected output is the same with and without +dotprod. Without
; -global-isel a single saddl both widens and combines the inputs before one
; addv, and smov produces the sign-extended return value; with -global-isel
; each input is widened and reduced separately, the scalars are added, and
; sxth sign-extends the result.
define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-BASE-NEXT: addv h0, v0.8h
; CHECK-SD-BASE-NEXT: smov w0, v0.h[0]
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: saddl v0.8h, v0.8b, v1.8b
; CHECK-SD-DOT-NEXT: addv h0, v0.8h
; CHECK-SD-DOT-NEXT: smov w0, v0.h[0]
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: addv h0, v0.8h
; CHECK-GI-BASE-NEXT: addv h1, v1.8h
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxth
; CHECK-GI-BASE-NEXT: sxth w0, w8
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i16_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: addv h0, v0.8h
; CHECK-GI-DOT-NEXT: addv h1, v1.8h
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxth
; CHECK-GI-DOT-NEXT: sxth w0, w8
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%yy = sext <8 x i8> %y to <8 x i16>
%z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
%z = add i16 %z1, %z2
ret i16 %z
}
; Sum of two add-reductions with no extension step and a zeroext i8 return:
; each <16 x i8> argument is reduced directly with llvm.vector.reduce.add and
; the two i8 results are added.
; The expected output is the same with and without +dotprod. Without
; -global-isel the inputs are combined by one vector add before a single addv;
; with -global-isel each input is reduced separately, the scalars are added
; with a uxtb-extended operand, and the result is masked to 8 bits with
; and #0xff.
define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-SD-BASE-NEXT: addv b0, v0.16b
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i8:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.16b, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT: addv b0, v0.16b
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addv b0, v0.16b
; CHECK-GI-BASE-NEXT: addv b1, v1.16b
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: add w8, w9, w8, uxtb
; CHECK-GI-BASE-NEXT: and w0, w8, #0xff
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i8:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addv b0, v0.16b
; CHECK-GI-DOT-NEXT: addv b1, v1.16b
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: add w8, w9, w8, uxtb
; CHECK-GI-DOT-NEXT: and w0, w8, #0xff
; CHECK-GI-DOT-NEXT: ret
entry:
%z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
%z = add i8 %z1, %z2
ret i8 %z
}
; Pair reduction, zext i8 -> i64: each <16 x i8> argument is zero-extended to
; <16 x i64> and add-reduced, then the two i64 sums are added. Per the expected
; output below, SelectionDAG widens with ushll/ushll2 up to 32-bit lanes, uses
; uaddl/uaddl2 for the step to 64-bit lanes, and finishes with a single addp.
; GlobalISel widens with ushll/uaddw2, reduces each input with its own addp and
; adds the two scalars. The +dotprod runs expect the same code as the base runs.
define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v5.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v6.4s, v3.8h, #0
; CHECK-SD-BASE-NEXT: ushll2 v7.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-BASE-NEXT: uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-BASE-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll2 v2.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.8h, v1.16b, #0
; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v5.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v6.4s, v3.8h, #0
; CHECK-SD-DOT-NEXT: ushll2 v7.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll v3.4s, v3.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v2.2d, v5.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v4.2s
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-DOT-NEXT: uaddl v6.2d, v7.2s, v6.2s
; CHECK-SD-DOT-NEXT: uaddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-BASE-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v16.2d, v4.2s, #0
; CHECK-GI-BASE-NEXT: ushll v17.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v18.2d, v5.2s, #0
; CHECK-GI-BASE-NEXT: ushll v19.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v20.2d, v6.2s, #0
; CHECK-GI-BASE-NEXT: ushll v21.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v22.2d, v7.2s, #0
; CHECK-GI-BASE-NEXT: ushll v23.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: ushll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.8h, v1.16b, #0
; CHECK-GI-DOT-NEXT: ushll v4.4s, v2.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-DOT-NEXT: ushll v5.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v6.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-DOT-NEXT: ushll v7.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v16.2d, v4.2s, #0
; CHECK-GI-DOT-NEXT: ushll v17.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v18.2d, v5.2s, #0
; CHECK-GI-DOT-NEXT: ushll v19.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v20.2d, v6.2s, #0
; CHECK-GI-DOT-NEXT: ushll v21.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v22.2d, v7.2s, #0
; CHECK-GI-DOT-NEXT: ushll v23.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = zext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, sext i8 -> i64: each <16 x i8> argument is sign-extended to
; <16 x i64> and add-reduced, then the two i64 sums are added. Per the expected
; output below, SelectionDAG widens with sshll/sshll2 up to 32-bit lanes, uses
; saddl/saddl2 for the step to 64-bit lanes, and finishes with a single addp.
; GlobalISel widens with sshll/saddw2, reduces each input with its own addp and
; adds the two scalars. The +dotprod runs expect the same code as the base runs.
define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v5.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v6.4s, v3.8h, #0
; CHECK-SD-BASE-NEXT: sshll2 v7.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-BASE-NEXT: saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-BASE-NEXT: saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll2 v2.8h, v0.16b, #0
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.8h, v1.16b, #0
; CHECK-SD-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v5.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v6.4s, v3.8h, #0
; CHECK-SD-DOT-NEXT: sshll2 v7.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll v3.4s, v3.4h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v16.2d, v5.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v2.2d, v5.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v5.2d, v0.4s, v4.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v4.2s
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v7.4s, v6.4s
; CHECK-SD-DOT-NEXT: saddl v6.2d, v7.2s, v6.2s
; CHECK-SD-DOT-NEXT: saddl2 v7.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v3.2d, v5.2d, v16.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v2.2d, v7.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v6.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v3.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-BASE-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-GI-BASE-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v6.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-GI-BASE-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v16.2d, v4.2s, #0
; CHECK-GI-BASE-NEXT: sshll v17.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v18.2d, v5.2s, #0
; CHECK-GI-BASE-NEXT: sshll v19.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v20.2d, v6.2s, #0
; CHECK-GI-BASE-NEXT: sshll v21.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v22.2d, v7.2s, #0
; CHECK-GI-BASE-NEXT: sshll v23.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v2.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.8h, v0.16b, #0
; CHECK-GI-DOT-NEXT: sshll v3.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.8h, v1.16b, #0
; CHECK-GI-DOT-NEXT: sshll v4.4s, v2.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v2.4s, v2.8h, #0
; CHECK-GI-DOT-NEXT: sshll v5.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v6.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v3.4s, v3.8h, #0
; CHECK-GI-DOT-NEXT: sshll v7.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v16.2d, v4.2s, #0
; CHECK-GI-DOT-NEXT: sshll v17.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v18.2d, v5.2s, #0
; CHECK-GI-DOT-NEXT: sshll v19.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v20.2d, v6.2s, #0
; CHECK-GI-DOT-NEXT: sshll v21.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v22.2d, v7.2s, #0
; CHECK-GI-DOT-NEXT: sshll v23.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v4.2d, v16.2d, v4.4s
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v17.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v5.2d, v18.2d, v5.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v19.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v6.2d, v20.2d, v6.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v21.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v7.2d, v22.2d, v7.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v23.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v2.2d, v4.2d, v2.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v5.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v3.2d, v6.2d, v3.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v7.2d, v1.2d
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%yy = sext <16 x i8> %y to <16 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, zext i8 -> i64: each <8 x i8> argument is zero-extended to
; <8 x i64> and add-reduced, then the two i64 sums are added. Per the expected
; output below, SelectionDAG widens with ushll/ushll2 plus uaddl/uaddl2, sums
; both inputs in vector form and ends with a single addp. GlobalISel widens
; with ushll/uaddw2, reduces each input with its own addp and adds the two
; scalars. The +dotprod runs expect the same code as the base runs.
define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: ushll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: uaddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: uaddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: uaddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: ushll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: ushll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: ushll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: uaddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: uaddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: uaddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: uaddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = zext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, sext i8 -> i64: each <8 x i8> argument is sign-extended to
; <8 x i64> and add-reduced, then the two i64 sums are added. Per the expected
; output below, SelectionDAG widens with sshll/sshll2 plus saddl/saddl2, sums
; both inputs in vector form and ends with a single addp. GlobalISel widens
; with sshll/saddw2, reduces each input with its own addp and adds the two
; scalars. The +dotprod runs expect the same code as the base runs.
define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-BASE-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-BASE-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-BASE-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-BASE-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-BASE-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-SD-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-SD-DOT-NEXT: sshll2 v2.4s, v0.8h, #0
; CHECK-SD-DOT-NEXT: sshll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: sshll2 v3.4s, v1.8h, #0
; CHECK-SD-DOT-NEXT: sshll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: saddl2 v4.2d, v0.4s, v2.4s
; CHECK-SD-DOT-NEXT: saddl v0.2d, v0.2s, v2.2s
; CHECK-SD-DOT-NEXT: saddl2 v2.2d, v1.4s, v3.4s
; CHECK-SD-DOT-NEXT: saddl v1.2d, v1.2s, v3.2s
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v4.2d
; CHECK-SD-DOT-NEXT: add v1.2d, v1.2d, v2.2d
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-BASE-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-BASE-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-BASE-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-BASE-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-BASE-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-BASE-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-GI-DOT-NEXT: sshll v1.8h, v1.8b, #0
; CHECK-GI-DOT-NEXT: sshll v2.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: sshll v3.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: sshll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: sshll v4.2d, v2.2s, #0
; CHECK-GI-DOT-NEXT: sshll v5.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: sshll v6.2d, v3.2s, #0
; CHECK-GI-DOT-NEXT: sshll v7.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: saddw2 v2.2d, v4.2d, v2.4s
; CHECK-GI-DOT-NEXT: saddw2 v0.2d, v5.2d, v0.4s
; CHECK-GI-DOT-NEXT: saddw2 v3.2d, v6.2d, v3.4s
; CHECK-GI-DOT-NEXT: saddw2 v1.2d, v7.2d, v1.4s
; CHECK-GI-DOT-NEXT: add v0.2d, v2.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v3.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%yy = sext <8 x i8> %y to <8 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, zext i8 -> i64 on a narrow <4 x i8> input: each argument is
; zero-extended to <4 x i64> and add-reduced, then the two i64 sums are added.
; Per the expected output below, the arguments are handled in 16-bit lanes
; (.4h). SelectionDAG clears the upper byte of each lane with bic, widens with
; ushll, accumulates both inputs via uaddlp/uadalp and ends with one addp.
; GlobalISel widens to 64-bit lanes first, masks each lane with 0xff, reduces
; each input with its own addp and adds the two scalars. The +dotprod runs
; expect the same code as the base runs.
define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-BASE-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-BASE-NEXT: addp d0, v1.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: bic v1.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: bic v0.4h, #255, lsl #8
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: uaddlp v1.2d, v1.4s
; CHECK-SD-DOT-NEXT: uadalp v1.2d, v0.4s
; CHECK-SD-DOT-NEXT: addp d0, v1.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll v4.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-GI-BASE-NEXT: and v3.16b, v3.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v4.16b, v4.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-BASE-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v3.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll v4.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-GI-DOT-NEXT: and v3.16b, v3.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v4.16b, v4.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: add v0.2d, v3.2d, v0.2d
; CHECK-GI-DOT-NEXT: add v1.2d, v4.2d, v1.2d
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = zext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, sext i8 -> i64 on a narrow <4 x i8> input: each argument is
; sign-extended to <4 x i64> and add-reduced, then the two i64 sums are added.
; Per the expected output below, both selectors widen the lanes to 64 bits with
; ushll/ushll2 and then sign-extend from bit 7 using shl #56 followed by an
; arithmetic shift right by 56 (sshr, or ssra to shift and accumulate).
; SelectionDAG adds the two vectors and emits one addp; GlobalISel emits one
; addp per input and adds the scalars. The +dotprod runs expect the same code
; as the base runs.
define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-BASE-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-SD-BASE-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-BASE-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-SD-BASE-NEXT: shl v2.2d, v2.2d, #56
; CHECK-SD-BASE-NEXT: shl v3.2d, v3.2d, #56
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-SD-BASE-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-SD-BASE-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: add v0.2d, v2.2d, v3.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-SD-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-SD-DOT-NEXT: ushll v2.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll v3.2d, v1.2s, #0
; CHECK-SD-DOT-NEXT: ushll2 v0.2d, v0.4s, #0
; CHECK-SD-DOT-NEXT: ushll2 v1.2d, v1.4s, #0
; CHECK-SD-DOT-NEXT: shl v2.2d, v2.2d, #56
; CHECK-SD-DOT-NEXT: shl v3.2d, v3.2d, #56
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-SD-DOT-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-SD-DOT-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: add v0.2d, v2.2d, v3.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.2d, v0.4s, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.2d, v1.4s, #0
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: shl v2.2d, v2.2d, #56
; CHECK-GI-BASE-NEXT: shl v3.2d, v3.2d, #56
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-GI-BASE-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-GI-BASE-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v2.2d
; CHECK-GI-BASE-NEXT: addp d1, v3.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v4i8_v4i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll v1.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v2.2d, v0.4s, #0
; CHECK-GI-DOT-NEXT: ushll2 v3.2d, v1.4s, #0
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: shl v2.2d, v2.2d, #56
; CHECK-GI-DOT-NEXT: shl v3.2d, v3.2d, #56
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: sshr v2.2d, v2.2d, #56
; CHECK-GI-DOT-NEXT: sshr v3.2d, v3.2d, #56
; CHECK-GI-DOT-NEXT: ssra v2.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: ssra v3.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v2.2d
; CHECK-GI-DOT-NEXT: addp d1, v3.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%yy = sext <4 x i8> %y to <4 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, zext i8 -> i64 on a <2 x i8> input: each argument is
; zero-extended to <2 x i64> and add-reduced, then the two i64 sums are added.
; Per the expected output below, the arguments are handled in 32-bit lanes
; (.2s). SelectionDAG masks the low byte of each 32-bit lane, combines both
; inputs with one uaddl and ends with one addp. GlobalISel widens to 64-bit
; lanes with ushll, masks each lane with 0xff, reduces each input with its own
; addp and adds the scalars. The +dotprod runs expect the same code as the
; base runs.
define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: movi d2, #0x0000ff000000ff
; CHECK-SD-BASE-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-BASE-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-BASE-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi d2, #0x0000ff000000ff
; CHECK-SD-DOT-NEXT: and v0.8b, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT: and v1.8b, v1.8b, v2.8b
; CHECK-SD-DOT-NEXT: uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-BASE-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v2.2d, #0x000000000000ff
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-DOT-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = zext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Pair reduction, sext i8 -> i64 on a <2 x i8> input: each argument is
; sign-extended to <2 x i64> and add-reduced, then the two i64 sums are added.
; Per the expected output below, both selectors widen the lanes to 64 bits with
; ushll and then sign-extend from bit 7 using shl #56 followed by an arithmetic
; shift right by 56. SelectionDAG folds the second shift into an accumulating
; ssra and emits one addp; GlobalISel shifts each input with sshr, emits one
; addp per input and adds the scalars. The +dotprod runs expect the same code
; as the base runs.
define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-SD-BASE-NEXT: ssra v0.2d, v1.2d, #56
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-SD-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-SD-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-SD-DOT-NEXT: ssra v0.2d, v1.2d, #56
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-BASE-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-BASE-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-BASE-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i8_v2i64_sext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-GI-DOT-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-GI-DOT-NEXT: shl v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: shl v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: sshr v0.2d, v0.2d, #56
; CHECK-GI-DOT-NEXT: sshr v1.2d, v1.2d, #56
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%yy = sext <2 x i8> %y to <2 x i64>
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
%z = add i64 %z1, %z2
ret i64 %z
}
; Four-input reduction test mixing extension kinds: %ax and %ay are
; zero-extended, %bx and %by are sign-extended, all from <8 x i8> to
; <8 x i32>. Each extended vector is add-reduced on its own and the four
; scalars are summed pairwise (unsigned pair, signed pair, then both).
;
; Expected code per configuration (see the generated assertions below):
;  - SelectionDAG without +dotprod: widen to 8h, then uaddlp/uadalp for the
;    unsigned pair and saddlp/sadalp for the signed pair, one vector add and
;    one addv.
;  - SelectionDAG with +dotprod: udot/sdot against a vector of ones, using one
;    zeroed accumulator per extension kind, then add and addp.
;  - GlobalISel without +dotprod: each input is widened and reduced with its
;    own addv; the four results are added as scalars.
;  - GlobalISel with +dotprod: one udot/sdot and one addp per input (four
;    zeroed accumulators); the four results are added as scalars.
define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-SD-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-SD-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: saddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: sadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: movi v4.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: movi v5.8b, #1
; CHECK-SD-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-SD-DOT-NEXT: udot v6.2s, v1.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v3.8b, v5.8b
; CHECK-SD-DOT-NEXT: udot v6.2s, v0.8b, v5.8b
; CHECK-SD-DOT-NEXT: sdot v4.2s, v2.8b, v5.8b
; CHECK-SD-DOT-NEXT: add v0.2s, v6.2s, v4.2s
; CHECK-SD-DOT-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v0.8h, v0.8b, #0
; CHECK-GI-BASE-NEXT: ushll v1.8h, v1.8b, #0
; CHECK-GI-BASE-NEXT: sshll v2.8h, v2.8b, #0
; CHECK-GI-BASE-NEXT: sshll v3.8h, v3.8b, #0
; CHECK-GI-BASE-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: sshll v6.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: sshll v7.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v4.4s, v0.8h
; CHECK-GI-BASE-NEXT: uaddw2 v1.4s, v5.4s, v1.8h
; CHECK-GI-BASE-NEXT: saddw2 v2.4s, v6.4s, v2.8h
; CHECK-GI-BASE-NEXT: saddw2 v3.4s, v7.4s, v3.8h
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: addv s1, v1.4s
; CHECK-GI-BASE-NEXT: addv s2, v2.4s
; CHECK-GI-BASE-NEXT: addv s3, v3.4s
; CHECK-GI-BASE-NEXT: fmov w8, s0
; CHECK-GI-BASE-NEXT: fmov w9, s1
; CHECK-GI-BASE-NEXT: fmov w10, s2
; CHECK-GI-BASE-NEXT: fmov w11, s3
; CHECK-GI-BASE-NEXT: add w8, w8, w9
; CHECK-GI-BASE-NEXT: add w9, w10, w11
; CHECK-GI-BASE-NEXT: add w0, w8, w9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: movi v4.8b, #1
; CHECK-GI-DOT-NEXT: movi v5.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v6.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v7.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: movi v16.2d, #0000000000000000
; CHECK-GI-DOT-NEXT: udot v5.2s, v0.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v6.2s, v3.8b, v4.8b
; CHECK-GI-DOT-NEXT: udot v7.2s, v1.8b, v4.8b
; CHECK-GI-DOT-NEXT: sdot v16.2s, v2.8b, v4.8b
; CHECK-GI-DOT-NEXT: addp v0.2s, v5.2s, v5.2s
; CHECK-GI-DOT-NEXT: addp v3.2s, v6.2s, v6.2s
; CHECK-GI-DOT-NEXT: addp v1.2s, v7.2s, v7.2s
; CHECK-GI-DOT-NEXT: addp v2.2s, v16.2s, v16.2s
; CHECK-GI-DOT-NEXT: fmov w8, s0
; CHECK-GI-DOT-NEXT: fmov w11, s3
; CHECK-GI-DOT-NEXT: fmov w9, s1
; CHECK-GI-DOT-NEXT: fmov w10, s2
; CHECK-GI-DOT-NEXT: add w8, w8, w9
; CHECK-GI-DOT-NEXT: add w9, w10, w11
; CHECK-GI-DOT-NEXT: add w0, w8, w9
; CHECK-GI-DOT-NEXT: ret
entry:
; Unsigned pair: zero-extend, reduce each input, add the two scalars.
%axx = zext <8 x i8> %ax to <8 x i32>
%az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
%ayy = zext <8 x i8> %ay to <8 x i32>
%az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
%az = add i32 %az1, %az2
; Signed pair: sign-extend, reduce each input, add the two scalars.
%bxx = sext <8 x i8> %bx to <8 x i32>
%bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
%byy = sext <8 x i8> %by to <8 x i32>
%bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
%bz = add i32 %bz1, %bz2
; Final result is the unsigned total plus the signed total.
%z = add i32 %az, %bz
ret i32 %z
}
; Four-input reduction test where the widened vectors are split in half by
; shufflevector before being summed. Each <8 x i16> input is extended to
; <8 x i32>, its elements 0-3 and 4-7 are extracted as two <4 x i32> vectors
; and added, the four resulting <4 x i32> sums are added together, and a
; single reduce.add.v4i32 produces the returned scalar.
;
; In the value names, %sNh holds elements 0-3 and %sNl holds elements 4-7 of
; input N.
;
; NOTE(review): the function name contains "sext_zext", but all four inputs
; (%ax, %ay, %bx, %by) are zero-extended here. It is not clear from this file
; whether the name or the IR reflects the intent; confirm before changing
; either, since the generated assertions depend on the current IR.
;
; Expected code per configuration (see the generated assertions below):
;  - SelectionDAG, with and without +dotprod: uaddlp/uadalp per input pair,
;    one vector add and one addv.
;  - GlobalISel, with and without +dotprod: explicit ushll/ushll2 widening of
;    each input, a tree of vector adds, and one addv.
define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
; CHECK-SD-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-BASE-NEXT: uaddlp v3.4s, v3.8h
; CHECK-SD-BASE-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-BASE-NEXT: uadalp v3.4s, v2.8h
; CHECK-SD-BASE-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-BASE-NEXT: addv s0, v0.4s
; CHECK-SD-BASE-NEXT: fmov w0, s0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: uaddlp v1.4s, v1.8h
; CHECK-SD-DOT-NEXT: uaddlp v3.4s, v3.8h
; CHECK-SD-DOT-NEXT: uadalp v1.4s, v0.8h
; CHECK-SD-DOT-NEXT: uadalp v3.4s, v2.8h
; CHECK-SD-DOT-NEXT: add v0.4s, v3.4s, v1.4s
; CHECK-SD-DOT-NEXT: addv s0, v0.4s
; CHECK-SD-DOT-NEXT: fmov w0, s0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-BASE-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-BASE-NEXT: ushll v6.4s, v2.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-BASE-NEXT: ushll v7.4s, v3.4h, #0
; CHECK-GI-BASE-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-BASE-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-GI-BASE-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-GI-BASE-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-BASE-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-GI-BASE-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-BASE-NEXT: addv s0, v0.4s
; CHECK-GI-BASE-NEXT: fmov w0, s0
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: ushll v4.4s, v0.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v0.4s, v0.8h, #0
; CHECK-GI-DOT-NEXT: ushll v5.4s, v1.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v1.4s, v1.8h, #0
; CHECK-GI-DOT-NEXT: ushll v6.4s, v2.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v2.4s, v2.8h, #0
; CHECK-GI-DOT-NEXT: ushll v7.4s, v3.4h, #0
; CHECK-GI-DOT-NEXT: ushll2 v3.4s, v3.8h, #0
; CHECK-GI-DOT-NEXT: add v0.4s, v4.4s, v0.4s
; CHECK-GI-DOT-NEXT: add v1.4s, v5.4s, v1.4s
; CHECK-GI-DOT-NEXT: add v2.4s, v6.4s, v2.4s
; CHECK-GI-DOT-NEXT: add v3.4s, v7.4s, v3.4s
; CHECK-GI-DOT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-DOT-NEXT: add v1.4s, v2.4s, v3.4s
; CHECK-GI-DOT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-GI-DOT-NEXT: addv s0, v0.4s
; CHECK-GI-DOT-NEXT: fmov w0, s0
; CHECK-GI-DOT-NEXT: ret
entry:
; Input %ax: widen, split into elements 0-3 and 4-7, add the halves.
%axx = zext <8 x i16> %ax to <8 x i32>
%s1h = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s1l = shufflevector <8 x i32> %axx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%axs = add <4 x i32> %s1h, %s1l
; Input %ay: same pattern, then combine with the %ax result.
%ayy = zext <8 x i16> %ay to <8 x i32>
%s2h = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s2l = shufflevector <8 x i32> %ayy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%ays = add <4 x i32> %s2h, %s2l
%az = add <4 x i32> %axs, %ays
; Input %bx: same pattern (also zero-extended, see the note above).
%bxx = zext <8 x i16> %bx to <8 x i32>
%s3h = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s3l = shufflevector <8 x i32> %bxx, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bxs = add <4 x i32> %s3h, %s3l
; Input %by: same pattern, then combine with the %bx result.
%byy = zext <8 x i16> %by to <8 x i32>
%s4h = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%s4l = shufflevector <8 x i32> %byy, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bys = add <4 x i32> %s4h, %s4l
%bz = add <4 x i32> %bxs, %bys
; Merge both pairs and perform the only horizontal reduction in the function.
%z = add <4 x i32> %az, %bz
%z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
ret i32 %z2
}
; Pairwise reduction test with no extension: add-reduce two <2 x i64> vectors
; separately and return the sum of the two scalars.
;
; Expected code per configuration (see the generated assertions below):
;  - SelectionDAG, with and without +dotprod: the two inputs are added as
;    vectors first, so only one addp is needed.
;  - GlobalISel, with and without +dotprod: each input gets its own addp and
;    the results are added in general-purpose registers.
define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-SD-BASE-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD-BASE: // %bb.0: // %entry
; CHECK-SD-BASE-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-BASE-NEXT: addp d0, v0.2d
; CHECK-SD-BASE-NEXT: fmov x0, d0
; CHECK-SD-BASE-NEXT: ret
;
; CHECK-SD-DOT-LABEL: add_pair_v2i64_v2i64:
; CHECK-SD-DOT: // %bb.0: // %entry
; CHECK-SD-DOT-NEXT: add v0.2d, v0.2d, v1.2d
; CHECK-SD-DOT-NEXT: addp d0, v0.2d
; CHECK-SD-DOT-NEXT: fmov x0, d0
; CHECK-SD-DOT-NEXT: ret
;
; CHECK-GI-BASE-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI-BASE: // %bb.0: // %entry
; CHECK-GI-BASE-NEXT: addp d0, v0.2d
; CHECK-GI-BASE-NEXT: addp d1, v1.2d
; CHECK-GI-BASE-NEXT: fmov x8, d0
; CHECK-GI-BASE-NEXT: fmov x9, d1
; CHECK-GI-BASE-NEXT: add x0, x8, x9
; CHECK-GI-BASE-NEXT: ret
;
; CHECK-GI-DOT-LABEL: add_pair_v2i64_v2i64:
; CHECK-GI-DOT: // %bb.0: // %entry
; CHECK-GI-DOT-NEXT: addp d0, v0.2d
; CHECK-GI-DOT-NEXT: addp d1, v1.2d
; CHECK-GI-DOT-NEXT: fmov x8, d0
; CHECK-GI-DOT-NEXT: fmov x9, d1
; CHECK-GI-DOT-NEXT: add x0, x8, x9
; CHECK-GI-DOT-NEXT: ret
entry:
; Reduce each operand independently, then add the scalar results.
%z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
%z = add i64 %z1, %z2
ret i64 %z
}
; Fully unrolled sum of absolute differences over eight rows of eight bytes.
;
; Parameters:
;   %p1, %s1 - first buffer and its row stride in bytes (i32, sign-extended)
;   %p2, %s2 - second buffer and its row stride in bytes (i32, sign-extended)
; Returns the i32 sum, over all eight rows, of |p1[i] - p2[i]| for the eight
; bytes of each row.
;
; Each row loads <8 x i8> from both buffers (align 1), zero-extends to
; <8 x i32>, subtracts, takes llvm.abs, add-reduces the vector, and adds the
; scalar to the running total. llvm.abs is called with its i1 argument set to
; true, which per the LLVM LangRef makes an INT_MIN input poison.
;
; The assertions below use prefixes shared by the SelectionDAG and GlobalISel
; runs; the file header records GlobalISel taking the fallback path for this
; function.
;  - Without +dotprod: uabdl per row, accumulated with uaddlp/uadalp into one
;    4s register, then a single addv.
;  - With +dotprod: uabd per row, accumulated with udot against a vector of
;    ones into one 2s register, then a single addp.
define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
; CHECK-BASE-LABEL: full:
; CHECK-BASE: // %bb.0: // %entry
; CHECK-BASE-NEXT: ldr d0, [x2]
; CHECK-BASE-NEXT: ldr d1, [x0]
; CHECK-BASE-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-BASE-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-BASE-NEXT: sxtw x8, w3
; CHECK-BASE-NEXT: sxtw x9, w1
; CHECK-BASE-NEXT: uabdl v0.8h, v1.8b, v0.8b
; CHECK-BASE-NEXT: add x11, x2, x8
; CHECK-BASE-NEXT: add x10, x0, x9
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uaddlp v0.4s, v0.8h
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: add x11, x11, x8
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: add x10, x10, x9
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11]
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10]
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: ldr d2, [x11, x8]
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: ldr d1, [x10, x9]
; CHECK-BASE-NEXT: uabdl v1.8h, v1.8b, v2.8b
; CHECK-BASE-NEXT: uadalp v0.4s, v1.8h
; CHECK-BASE-NEXT: addv s0, v0.4s
; CHECK-BASE-NEXT: fmov w0, s0
; CHECK-BASE-NEXT: ret
;
; CHECK-DOT-LABEL: full:
; CHECK-DOT: // %bb.0: // %entry
; CHECK-DOT-NEXT: ldr d0, [x0]
; CHECK-DOT-NEXT: ldr d1, [x2]
; CHECK-DOT-NEXT: // kill: def $w3 killed $w3 def $x3
; CHECK-DOT-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-DOT-NEXT: sxtw x8, w3
; CHECK-DOT-NEXT: sxtw x9, w1
; CHECK-DOT-NEXT: movi v2.2d, #0000000000000000
; CHECK-DOT-NEXT: movi v3.8b, #1
; CHECK-DOT-NEXT: uabd v0.8b, v0.8b, v1.8b
; CHECK-DOT-NEXT: add x11, x2, x8
; CHECK-DOT-NEXT: add x10, x0, x9
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: add x10, x10, x9
; CHECK-DOT-NEXT: add x11, x11, x8
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10]
; CHECK-DOT-NEXT: ldr d4, [x11]
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: ldr d1, [x10, x9]
; CHECK-DOT-NEXT: ldr d4, [x11, x8]
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: uabd v0.8b, v1.8b, v4.8b
; CHECK-DOT-NEXT: udot v2.2s, v0.8b, v3.8b
; CHECK-DOT-NEXT: addp v0.2s, v2.2s, v2.2s
; CHECK-DOT-NEXT: fmov w0, s0
; CHECK-DOT-NEXT: ret
entry:
; Sign-extend both strides once; they are reused for every row advance.
%idx.ext8 = sext i32 %s2 to i64
%idx.ext = sext i32 %s1 to i64
; Row 0: read directly from %p1 and %p2.
%0 = load <8 x i8>, ptr %p1, align 1
%1 = zext <8 x i8> %0 to <8 x i32>
%2 = load <8 x i8>, ptr %p2, align 1
%3 = zext <8 x i8> %2 to <8 x i32>
%4 = sub nsw <8 x i32> %1, %3
%5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
%6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
; Row 1: advance each pointer by its stride, then accumulate onto row 0.
%add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
%add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
%7 = load <8 x i8>, ptr %add.ptr, align 1
%8 = zext <8 x i8> %7 to <8 x i32>
%9 = load <8 x i8>, ptr %add.ptr9, align 1
%10 = zext <8 x i8> %9 to <8 x i32>
%11 = sub nsw <8 x i32> %8, %10
%12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
%13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
%op.rdx.1 = add i32 %13, %6
; Row 2.
%add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
%add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
%14 = load <8 x i8>, ptr %add.ptr.1, align 1
%15 = zext <8 x i8> %14 to <8 x i32>
%16 = load <8 x i8>, ptr %add.ptr9.1, align 1
%17 = zext <8 x i8> %16 to <8 x i32>
%18 = sub nsw <8 x i32> %15, %17
%19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
%20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
%op.rdx.2 = add i32 %20, %op.rdx.1
; Row 3.
%add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
%add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
%21 = load <8 x i8>, ptr %add.ptr.2, align 1
%22 = zext <8 x i8> %21 to <8 x i32>
%23 = load <8 x i8>, ptr %add.ptr9.2, align 1
%24 = zext <8 x i8> %23 to <8 x i32>
%25 = sub nsw <8 x i32> %22, %24
%26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
%27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
%op.rdx.3 = add i32 %27, %op.rdx.2
; Row 4.
%add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
%add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
%28 = load <8 x i8>, ptr %add.ptr.3, align 1
%29 = zext <8 x i8> %28 to <8 x i32>
%30 = load <8 x i8>, ptr %add.ptr9.3, align 1
%31 = zext <8 x i8> %30 to <8 x i32>
%32 = sub nsw <8 x i32> %29, %31
%33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
%34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
%op.rdx.4 = add i32 %34, %op.rdx.3
; Row 5.
%add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
%add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
%35 = load <8 x i8>, ptr %add.ptr.4, align 1
%36 = zext <8 x i8> %35 to <8 x i32>
%37 = load <8 x i8>, ptr %add.ptr9.4, align 1
%38 = zext <8 x i8> %37 to <8 x i32>
%39 = sub nsw <8 x i32> %36, %38
%40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
%41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
%op.rdx.5 = add i32 %41, %op.rdx.4
; Row 6.
%add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
%add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
%42 = load <8 x i8>, ptr %add.ptr.5, align 1
%43 = zext <8 x i8> %42 to <8 x i32>
%44 = load <8 x i8>, ptr %add.ptr9.5, align 1
%45 = zext <8 x i8> %44 to <8 x i32>
%46 = sub nsw <8 x i32> %43, %45
%47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
%48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
%op.rdx.6 = add i32 %48, %op.rdx.5
; Row 7 (last): its running total is the return value.
%add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
%add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
%49 = load <8 x i8>, ptr %add.ptr.6, align 1
%50 = zext <8 x i8> %49 to <8 x i32>
%51 = load <8 x i8>, ptr %add.ptr9.6, align 1
%52 = zext <8 x i8> %51 to <8 x i32>
%53 = sub nsw <8 x i32> %50, %52
%54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
%55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
%op.rdx.7 = add i32 %55, %op.rdx.6
ret i32 %op.rdx.7
}
declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)