author     Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>   2014-12-08 14:36:51 +0000
committer  Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>   2014-12-08 14:36:51 +0000
commit     64bc246f3feb7eb672ba53860c0c1a98c29f0e48 (patch)
tree       04731d0e79397f5b73b954775fc9dda4f8ad91d9
parent     3519da82b83a7517d3be6b75d8256bfd8e24efbd (diff)
download   bcm5719-llvm-64bc246f3feb7eb672ba53860c0c1a98c29f0e48.tar.gz
           bcm5719-llvm-64bc246f3feb7eb672ba53860c0c1a98c29f0e48.zip
[X86] Improved lowering of packed v8i16 vector shifts by non-constant count.
Before this patch, the backend suboptimally expanded the non-constant shift
count of a v8i16 shift into a sequence of two 'movd' plus 'movzwl'.

With this patch the backend checks whether the target has SSE4.1. If so, it
lets the shuffle legalizer deal with the expansion of the shift amount.

Example:
;;
define <8 x i16> @test(<8 x i16> %A, <8 x i16> %B) {
  %shamt = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shl = shl <8 x i16> %A, %shamt
  ret <8 x i16> %shl
}
;;

Before (with -mattr=+avx):
  vmovd    %xmm1, %eax
  movzwl   %ax, %eax
  vmovd    %eax, %xmm1
  vpsllw   %xmm1, %xmm0, %xmm0
  retq

Now:
  vpxor    %xmm2, %xmm2, %xmm2
  vpblendw $1, %xmm1, %xmm2, %xmm1
  vpsllw   %xmm1, %xmm0, %xmm0
  retq

llvm-svn: 223660
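To make the before/after sequences above concrete, here is a small standalone sketch written with SSE/AVX intrinsics. It is not code from the patch; the function names (shift_old, shift_new) and the test values are illustrative only, and it assumes a compiler with SSE4.1 or AVX enabled (e.g. -msse4.1). shift_old mirrors the pre-patch expansion through a general-purpose register (movd, movzwl, movd), while shift_new mirrors the post-patch vector-only expansion (pxor, pblendw, psllw):

#include <immintrin.h>
#include <stdio.h>

// Pre-patch idea: move the low word of the count vector to a GPR, zero-extend
// it, and move it back into an XMM register before the variable shift.
static __m128i shift_old(__m128i a, __m128i count) {
  unsigned c = (unsigned short)_mm_cvtsi128_si32(count); // movd + movzwl
  __m128i amt = _mm_cvtsi32_si128((int)c);               // movd
  return _mm_sll_epi16(a, amt);                          // psllw
}

// Post-patch idea: stay in vector registers. Blend the low word of the count
// vector into an all-zero vector; psllw only reads the low 64 bits of its
// count operand, and the blend guarantees the upper words are zero.
static __m128i shift_new(__m128i a, __m128i count) {
  __m128i zero = _mm_setzero_si128();                    // pxor
  __m128i amt = _mm_blend_epi16(zero, count, 0x01);      // pblendw $1
  return _mm_sll_epi16(a, amt);                          // psllw
}

int main(void) {
  __m128i a = _mm_set1_epi16(3);
  __m128i b = _mm_set1_epi16(4); // splatted shift count of 4
  __m128i r0 = shift_old(a, b);
  __m128i r1 = shift_new(a, b);
  // Both print 48 (3 << 4); the two expansions are equivalent.
  printf("%d %d\n", _mm_extract_epi16(r0, 0), _mm_extract_epi16(r1, 0));
  return 0;
}

Both helpers compute the same result; the second avoids the XMM-to-GPR round trip, which is exactly what the lowering change below buys when the shift amount is a zero-extended i16 splat and SSE4.1 is available.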
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp     | 30
-rw-r--r--  llvm/test/CodeGen/X86/lower-vec-shift-2.ll  | 15
2 files changed, 26 insertions(+), 19 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4c65a6ccd7e..6b4251df61a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16720,18 +16720,28 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
   case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
   }
 
-  // Need to build a vector containing shift amount.
-  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
-  SmallVector<SDValue, 4> ShOps;
-  ShOps.push_back(ShAmt);
-  if (SVT == MVT::i32) {
-    ShOps.push_back(DAG.getConstant(0, SVT));
+  const X86Subtarget &Subtarget =
+      DAG.getTarget().getSubtarget<X86Subtarget>();
+  if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
+      ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
+    // Let the shuffle legalizer expand this shift amount node.
+    SDValue Op0 = ShAmt.getOperand(0);
+    Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
+    ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, &Subtarget, DAG);
+  } else {
+    // Need to build a vector containing shift amount.
+    // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
+    SmallVector<SDValue, 4> ShOps;
+    ShOps.push_back(ShAmt);
+    if (SVT == MVT::i32) {
+      ShOps.push_back(DAG.getConstant(0, SVT));
+      ShOps.push_back(DAG.getUNDEF(SVT));
+    }
     ShOps.push_back(DAG.getUNDEF(SVT));
-  }
-  ShOps.push_back(DAG.getUNDEF(SVT));
-  MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
-  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+    MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
+    ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, BVT, ShOps);
+  }
 
   // The return type has to be a 128-bit type with the same element
   // type as the input type.
diff --git a/llvm/test/CodeGen/X86/lower-vec-shift-2.ll b/llvm/test/CodeGen/X86/lower-vec-shift-2.ll
index 90505b6dd8f..770775d3242 100644
--- a/llvm/test/CodeGen/X86/lower-vec-shift-2.ll
+++ b/llvm/test/CodeGen/X86/lower-vec-shift-2.ll
@@ -11,9 +11,8 @@ define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) {
 ; SSE2-NEXT: retq
 ; AVX-LABEL: test1:
 ; AVX: # BB#0
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -66,9 +65,8 @@ define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) {
 ; SSE2-NEXT: retq
 ; AVX-LABEL: test4:
 ; AVX: # BB#0
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -121,9 +119,8 @@ define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) {
 ; SSE2-NEXT: retq
 ; AVX-LABEL: test7:
 ; AVX: # BB#0
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: movzwl %ax, %eax
-; AVX-NEXT: vmovd %eax, %xmm1
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
 ; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry: