[X86][SSE] isHorizontalBinOp - add support for target shuffles
Add target shuffle decoding to isHorizontalBinOp, alongside the existing ISD::VECTOR_SHUFFLE support.
This means we can now look through bitcasts, so we need to bitcast the extracted args to ensure they have the correct type.

Fixes PR39936 and should help with PR39920/PR39921.
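
For context, the PR39936_v8i32 test below exercises the kind of pattern being matched, namely even/odd element shuffles feeding a binop:

  %2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
  %3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
  %4 = add <8 x i32> %2, %3

As the old CHECK lines suggest, by the time isHorizontalBinOp runs these shuffles may already have been lowered to target shuffle nodes (e.g. SHUFPS/UNPCKLPD/UNPCKHPD, reached through bitcasts in the integer cases), which the ISD::VECTOR_SHUFFLE-only check rejected; decoding them with getTargetShuffleMask allows the fold to HADD/HSUB.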
Differential Revision: https://reviews.llvm.org/D61245
llvm-svn: 359491
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c120ec1..4c708db 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39248,51 +39248,65 @@
// then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
// which is A horizontal-op B.
- // At least one of the operands should be a vector shuffle.
- if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
- RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
- return false;
-
MVT VT = LHS.getSimpleValueType();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for horizontal add/sub");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1,
+ SmallVectorImpl<int> &ShuffleMask) {
+ if (Op.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ if (!Op.getOperand(0).isUndef())
+ N0 = Op.getOperand(0);
+ if (!Op.getOperand(1).isUndef())
+ N1 = Op.getOperand(1);
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+ ShuffleMask.append(Mask.begin(), Mask.end());
+ return;
+ }
+ bool IsUnary;
+ SmallVector<SDValue, 2> SrcOps;
+ SmallVector<int, 16> SrcShuffleMask;
+ SDValue BC = peekThroughBitcasts(Op);
+ if (isTargetShuffle(BC.getOpcode()) &&
+ getTargetShuffleMask(BC.getNode(), BC.getSimpleValueType(), false,
+ SrcOps, SrcShuffleMask, IsUnary) &&
+ SrcOps.size() <= 2 && SrcShuffleMask.size() == NumElts) {
+ N0 = SrcOps.size() > 0 ? SrcOps[0] : SDValue();
+ N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue();
+ ShuffleMask.append(SrcShuffleMask.begin(), SrcShuffleMask.end());
+ }
+ };
// View LHS in the form
// LHS = VECTOR_SHUFFLE A, B, LMask
// If LHS is not a shuffle, then pretend it is the identity shuffle:
// LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
// NOTE: A default initialized SDValue represents an UNDEF of type VT.
- unsigned NumElts = VT.getVectorNumElements();
SDValue A, B;
- SmallVector<int, 16> LMask(NumElts);
- if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!LHS.getOperand(0).isUndef())
- A = LHS.getOperand(0);
- if (!LHS.getOperand(1).isUndef())
- B = LHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS)->getMask();
- llvm::copy(Mask, LMask.begin());
- } else {
- A = LHS;
- for (unsigned i = 0; i != NumElts; ++i)
- LMask[i] = i;
- }
+ SmallVector<int, 16> LMask;
+ GetShuffle(LHS, A, B, LMask);
// Likewise, view RHS in the form
// RHS = VECTOR_SHUFFLE C, D, RMask
SDValue C, D;
- SmallVector<int, 16> RMask(NumElts);
- if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
- if (!RHS.getOperand(0).isUndef())
- C = RHS.getOperand(0);
- if (!RHS.getOperand(1).isUndef())
- D = RHS.getOperand(1);
- ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS)->getMask();
- llvm::copy(Mask, RMask.begin());
- } else {
+ SmallVector<int, 16> RMask;
+ GetShuffle(RHS, C, D, RMask);
+
+ // At least one of the operands should be a vector shuffle.
+ if (LMask.empty() && RMask.empty())
+ return false;
+
+ if (LMask.empty()) {
+ A = LHS;
+ for (unsigned i = 0; i != NumElts; ++i)
+ LMask.push_back(i);
+ }
+
+ if (RMask.empty()) {
C = RHS;
for (unsigned i = 0; i != NumElts; ++i)
- RMask[i] = i;
+ RMask.push_back(i);
}
// If A and B occur in reverse order in RHS, then canonicalize by commuting
@@ -39359,7 +39373,8 @@
(Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
isHorizontalBinOp(LHS, RHS, IsFadd) &&
shouldUseHorizontalOp(LHS == RHS, DAG, Subtarget))
- return DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS);
+ return DAG.getNode(HorizOpcode, SDLoc(N), VT, DAG.getBitcast(VT, LHS),
+ DAG.getBitcast(VT, RHS));
return SDValue();
}
@@ -42261,6 +42276,8 @@
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HADD, DL, Ops[0].getValueType(), Ops);
};
+ Op0 = DAG.getBitcast(VT, Op0);
+ Op1 = DAG.getBitcast(VT, Op1);
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HADDBuilder);
}
@@ -42392,6 +42409,8 @@
ArrayRef<SDValue> Ops) {
return DAG.getNode(X86ISD::HSUB, DL, Ops[0].getValueType(), Ops);
};
+ Op0 = DAG.getBitcast(VT, Op0);
+ Op1 = DAG.getBitcast(VT, Op1);
return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {Op0, Op1},
HSUBBuilder);
}
diff --git a/llvm/test/CodeGen/X86/haddsub.ll b/llvm/test/CodeGen/X86/haddsub.ll
index c685d1e..e9a74a32 100644
--- a/llvm/test/CodeGen/X86/haddsub.ll
+++ b/llvm/test/CodeGen/X86/haddsub.ll
@@ -1632,9 +1632,7 @@
; AVX-SLOW-LABEL: PR39936_v8f32:
; AVX-SLOW: # %bb.0:
; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,2,2,3]
; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
@@ -1646,9 +1644,7 @@
; AVX-FAST-LABEL: PR39936_v8f32:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX-FAST-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll
index 697c8d7..356227c 100644
--- a/llvm/test/CodeGen/X86/phaddsub.ll
+++ b/llvm/test/CodeGen/X86/phaddsub.ll
@@ -803,32 +803,51 @@
; SSSE3-FAST-NEXT: movd %xmm0, %eax
; SSSE3-FAST-NEXT: retq
;
-; AVX-SLOW-LABEL: PR39936_v8i32:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: vmovd %xmm0, %eax
-; AVX-SLOW-NEXT: vzeroupper
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR39936_v8i32:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX1-SLOW-NEXT: vzeroupper
+; AVX1-SLOW-NEXT: retq
;
-; AVX-FAST-LABEL: PR39936_v8i32:
-; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,2],xmm1[0,2]
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vmovd %xmm0, %eax
-; AVX-FAST-NEXT: vzeroupper
-; AVX-FAST-NEXT: retq
+; AVX1-FAST-LABEL: PR39936_v8i32:
+; AVX1-FAST: # %bb.0:
+; AVX1-FAST-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX1-FAST-NEXT: vmovd %xmm0, %eax
+; AVX1-FAST-NEXT: vzeroupper
+; AVX1-FAST-NEXT: retq
+;
+; AVX2-SLOW-LABEL: PR39936_v8i32:
+; AVX2-SLOW: # %bb.0:
+; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vmovd %xmm0, %eax
+; AVX2-SLOW-NEXT: vzeroupper
+; AVX2-SLOW-NEXT: retq
+;
+; AVX2-FAST-LABEL: PR39936_v8i32:
+; AVX2-FAST: # %bb.0:
+; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX2-FAST-NEXT: vmovd %xmm0, %eax
+; AVX2-FAST-NEXT: vzeroupper
+; AVX2-FAST-NEXT: retq
%2 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 undef, i32 undef, i32 undef, i32 undef>
%3 = shufflevector <8 x i32> %0, <8 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
%4 = add <8 x i32> %2, %3
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 4fe07ca..ff3edc3 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1732,9 +1732,7 @@
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_0246_1357:
@@ -1775,9 +1773,7 @@
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
-; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
-; AVX1-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddpd %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v4f64_4602_5713:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 688ddce..67547f2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -2805,9 +2805,7 @@
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8f32_02468ACE_13579BDF:
@@ -2848,9 +2846,7 @@
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3],ymm0[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
-; AVX1-NEXT: vaddps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vhaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: add_v8f32_8ACE0246_9BDF1357: