author     Simon Pilgrim <llvm-dev@redking.me.uk>   2018-05-31 15:47:17 +0000
committer  Simon Pilgrim <llvm-dev@redking.me.uk>   2018-05-31 15:47:17 +0000
commit     ff0623cd29545720177439c3e65df2e2ac07db87 (patch)
tree       693e8e9d161c03424adc59f3743d53ba76f4657b
parent     c34395d889afed7d9e89ba33d26125dc96bfb240 (diff)
download   bcm5719-llvm-ff0623cd29545720177439c3e65df2e2ac07db87.tar.gz
           bcm5719-llvm-ff0623cd29545720177439c3e65df2e2ac07db87.zip
[X86][SSE] Recognise splat rotations and expand back to shift ops.
Noticed while fixing PR37426: for splat rotations (rotation by a uniform value) it's better to just expand back to shift ops than to perform a general non-uniform rotation. A brief illustrative sketch of the expansion follows the diffstat below.

llvm-svn: 333661
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp      12
-rw-r--r--  llvm/test/CodeGen/X86/vector-rotate-128.ll   87
-rw-r--r--  llvm/test/CodeGen/X86/vector-rotate-256.ll   36
3 files changed, 60 insertions, 75 deletions
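
For context (not part of the patch): the new LowerRotate path computes AmtR = EltSizeInBits - Amt and then ORs together a SHL by Amt and a SRL by AmtR, i.e. the usual rotate-left identity applied per lane. Below is a minimal, standalone C++ sketch of that per-lane identity for 32-bit elements; the name rotl32_via_shifts is purely illustrative and not an LLVM helper.

#include <cassert>
#include <cstdint>

// Per-lane semantics of the splat-rotate expansion:
// rotl(x, s) == (x << s) | (x >> (32 - s)) for 32-bit elements.
static uint32_t rotl32_via_shifts(uint32_t X, uint32_t Amt) {
  Amt &= 31;                // keep the amount in range for plain C++
  if (Amt == 0)
    return X;               // avoid the undefined shift by 32 in C++
  uint32_t AmtR = 32 - Amt; // mirrors the ISD::SUB of EltSizeInBits and Amt
  return (X << Amt) | (X >> AmtR); // mirrors the SHL/SRL/OR nodes
}

int main() {
  assert(rotl32_via_shifts(0x80000001u, 1) == 0x00000003u);
  assert(rotl32_via_shifts(0x12345678u, 4) == 0x23456781u);
  return 0;
}

When the rotate amount is a splat, the vector forms of these shifts map onto the uniform-shift instructions (e.g. PSLLD/PSRLD in the tests below), which is why this expansion beats the general non-uniform rotate lowering that previously went through PMULUDQ sequences.
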
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 7d1fd037869..99cc2da97af 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23133,6 +23133,8 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
SelectionDAG &DAG, const X86Subtarget &Subtarget,
unsigned Opcode) {
+ V = peekThroughEXTRACT_SUBVECTORs(V);
+
// Check if this is a splat build_vector node.
if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
SDValue SplatAmt = BV->getSplatValue();
@@ -23792,6 +23794,16 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
}
}
+ // Rotate by splat - expand back to shifts.
+ // TODO - legalizers should be able to handle this.
+ if (IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
+ SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
+ AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
+ SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
+ SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
+ return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
+ }
+
// AVX2 - best to fallback to variable shifts.
// TODO - legalizers should be able to handle this.
if (Subtarget.hasAVX2()) {
diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll
index a877533128a..2b97b89e936 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-128.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll
@@ -831,55 +831,42 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
; SSE2-LABEL: splatvar_rotate_v4i32:
; SSE2: # %bb.0:
+; SSE2-NEXT: xorps %xmm2, %xmm2
+; SSE2-NEXT: xorps %xmm3, %xmm3
+; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE2-NEXT: pslld $23, %xmm1
-; SSE2-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm2, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE2-NEXT: por %xmm3, %xmm0
+; SSE2-NEXT: movdqa %xmm0, %xmm4
+; SSE2-NEXT: pslld %xmm3, %xmm4
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
+; SSE2-NEXT: psubd %xmm1, %xmm3
+; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; SSE2-NEXT: psrld %xmm2, %xmm0
+; SSE2-NEXT: por %xmm4, %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: splatvar_rotate_v4i32:
; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; SSE41-NEXT: pslld $23, %xmm1
-; SSE41-NEXT: paddd {{.*}}(%rip), %xmm1
-; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
-; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; SSE41-NEXT: pmuludq %xmm2, %xmm3
-; SSE41-NEXT: pmuludq %xmm1, %xmm0
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; SSE41-NEXT: por %xmm1, %xmm0
+; SSE41-NEXT: movdqa %xmm0, %xmm3
+; SSE41-NEXT: pslld %xmm2, %xmm3
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32]
+; SSE41-NEXT: psubd %xmm1, %xmm2
+; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT: psrld %xmm1, %xmm0
+; SSE41-NEXT: por %xmm3, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: splatvar_rotate_v4i32:
; AVX1: # %bb.0:
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpslld %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [32,32,32,32]
+; AVX1-NEXT: vpsubd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v4i32:
@@ -922,21 +909,17 @@ define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
;
; X32-SSE-LABEL: splatvar_rotate_v4i32:
; X32-SSE: # %bb.0:
+; X32-SSE-NEXT: xorps %xmm2, %xmm2
+; X32-SSE-NEXT: xorps %xmm3, %xmm3
+; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; X32-SSE-NEXT: pslld $23, %xmm1
-; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT: cvttps2dq %xmm1, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm1, %xmm0
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; X32-SSE-NEXT: pmuludq %xmm2, %xmm1
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
-; X32-SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X32-SSE-NEXT: por %xmm3, %xmm0
+; X32-SSE-NEXT: movdqa %xmm0, %xmm4
+; X32-SSE-NEXT: pslld %xmm3, %xmm4
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
+; X32-SSE-NEXT: psubd %xmm1, %xmm3
+; X32-SSE-NEXT: movss {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3]
+; X32-SSE-NEXT: psrld %xmm2, %xmm0
+; X32-SSE-NEXT: por %xmm4, %xmm0
; X32-SSE-NEXT: retl
%splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
%splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat
diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll
index a048c0c5689..a32fc6305e9 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-256.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll
@@ -506,29 +506,19 @@ define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
-; AVX1-NEXT: vpor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpslld %xmm1, %xmm3, %xmm4
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [32,32,32,32]
+; AVX1-NEXT: vpsubd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT: vpsrld %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpor %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm1
+; AVX1-NEXT: vpsrld %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: splatvar_rotate_v8i32: