author     Craig Topper <craig.topper@intel.com>  2019-08-07 21:16:10 +0000
committer  Craig Topper <craig.topper@intel.com>  2019-08-07 21:16:10 +0000
commit     7f7ef0208b57a4d253cd8b07053460f40ad7cbc8 (patch)
tree       b05475a04ead756451dbe459b9b9d518663260b1
parent     d47be4da5ab2dd68e89701b2f3c9b355ab95ef36 (diff)
[X86] Allow pack instructions to be used for 512->256 truncates when -mprefer-vector-width=256 is causing 512-bit vectors to be split
If we're splitting the 512-bit vector anyway and we have zero/sign bits, then we might as well use pack instructions to concat and truncate at once.

Differential Revision: https://reviews.llvm.org/D65904

llvm-svn: 368210
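For illustration, the pack-then-permute shape this combine emits can be written with AVX2 intrinsics. This is a sketch of the equivalent computation, not code from the patch; the helper name trunc16i32_to_16i16 is hypothetical, and it assumes the sixteen i32 inputs are already known to fit in 16 bits, which is what the zero/sign-bit check in the combine guarantees before it fires:

#include <immintrin.h>

// Truncate sixteen i32 values (split across two 256-bit registers) to
// sixteen i16 values without a 512-bit vpmovdw.
static inline __m256i trunc16i32_to_16i16(__m256i lo, __m256i hi) {
  // vpackusdw packs dwords to words with unsigned saturation, but only
  // within each 128-bit lane, so the result comes out lane-interleaved.
  __m256i packed = _mm256_packus_epi32(lo, hi);
  // vpermq with immediate 0xD8 reorders the 64-bit chunks as [0,2,1,3],
  // restoring natural element order -- the vpermq seen in the tests below.
  return _mm256_permute4x64_epi64(packed, 0xD8);
}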
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp         | 11
-rw-r--r--  llvm/test/CodeGen/X86/min-legal-vector-width.ll  | 33
2 files changed, 22 insertions(+), 22 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index e18b55b9629..368f409394e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40898,8 +40898,8 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- // Requires SSE2 but AVX512 has fast truncate.
- if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ // Requires SSE2.
+ if (!Subtarget.hasSSE2())
return SDValue();
if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
@@ -40923,6 +40923,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // AVX512 has fast truncate, but if the input is already going to be split,
+ // there's no harm in trying pack.
+ if (Subtarget.hasAVX512() &&
+ !(!Subtarget.useAVX512Regs() && VT.is256BitVector() &&
+ InVT.is512BitVector()))
+ return SDValue();
+
unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
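Read positively, the new early-out says: keep trying the pack lowering when AVX512 is unavailable, or when 512-bit registers are disabled and this is exactly the 512-to-256 truncate that would be split anyway. A minimal equivalent restatement of the condition (a sketch for readability, not the in-tree code):

// Proceed with the pack lowering only if:
bool TryPack = !Subtarget.hasAVX512() ||       // no fast vpmov truncate, or
               (!Subtarget.useAVX512Regs() &&  // 512-bit ops will be split and
                VT.is256BitVector() &&         // this is precisely the
                InVT.is512BitVector());        // 512 -> 256 case
if (!TryPack)
  return SDValue();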
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
index b75fd2d8e05..40d557afe2b 100644
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -757,9 +757,8 @@ define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-ve
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlq $48, 32(%rdi), %ymm0
; CHECK-NEXT: vpsrlq $48, (%rdi), %ymm1
-; CHECK-NEXT: vpmovqd %ymm1, %xmm1
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpackusdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x
%b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -770,11 +769,9 @@ define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-ve
define <16 x i16> @trunc_v16i32_v16i16_zeroes(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
; CHECK-LABEL: trunc_v16i32_v16i16_zeroes:
; CHECK: # %bb.0:
-; CHECK-NEXT: vpsrld $16, 32(%rdi), %ymm0
-; CHECK-NEXT: vpsrld $16, (%rdi), %ymm1
-; CHECK-NEXT: vpmovdw %ymm1, %xmm1
-; CHECK-NEXT: vpmovdw %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vmovdqa (%rdi), %ymm1
+; CHECK-NEXT: vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
+; CHECK-NEXT: vpermi2w 32(%rdi), %ymm1, %ymm0
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
%b = lshr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -787,9 +784,8 @@ define <32 x i8> @trunc_v32i16_v32i8_zeroes(<32 x i16>* %x) nounwind "min-legal-
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrlw $8, 32(%rdi), %ymm0
; CHECK-NEXT: vpsrlw $8, (%rdi), %ymm1
-; CHECK-NEXT: vpmovwb %ymm1, %xmm1
-; CHECK-NEXT: vpmovwb %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpackuswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <32 x i16>, <32 x i16>* %x
%b = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
@@ -802,9 +798,8 @@ define <8 x i32> @trunc_v8i64_v8i32_sign(<8 x i64>* %x) nounwind "min-legal-vect
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraq $48, 32(%rdi), %ymm0
; CHECK-NEXT: vpsraq $48, (%rdi), %ymm1
-; CHECK-NEXT: vpmovqd %ymm1, %xmm1
-; CHECK-NEXT: vpmovqd %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <8 x i64>, <8 x i64>* %x
%b = ashr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -817,9 +812,8 @@ define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-
; CHECK: # %bb.0:
; CHECK-NEXT: vpsrad $16, 32(%rdi), %ymm0
; CHECK-NEXT: vpsrad $16, (%rdi), %ymm1
-; CHECK-NEXT: vpmovdw %ymm1, %xmm1
-; CHECK-NEXT: vpmovdw %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <16 x i32>, <16 x i32>* %x
%b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -832,9 +826,8 @@ define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-ve
; CHECK: # %bb.0:
; CHECK-NEXT: vpsraw $8, 32(%rdi), %ymm0
; CHECK-NEXT: vpsraw $8, (%rdi), %ymm1
-; CHECK-NEXT: vpmovwb %ymm1, %xmm1
-; CHECK-NEXT: vpmovwb %ymm0, %xmm0
-; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; CHECK-NEXT: vpacksswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; CHECK-NEXT: retq
%a = load <32 x i16>, <32 x i16>* %x
%b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
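The sign-bit cases follow the same shape with signed-saturating packs (vpackssdw/vpacksswb) and the identical [0,2,1,3] qword fixup. A hedged intrinsics sketch of the v32i16 to v32i8 case, with the hypothetical helper name trunc32i16_to_32i8, assuming each i16 input is already sign-extended from 8 bits as the ashr in the test guarantees:

#include <immintrin.h>

static inline __m256i trunc32i16_to_32i8(__m256i lo, __m256i hi) {
  // vpacksswb: signed saturation, per 128-bit lane.
  __m256i packed = _mm256_packs_epi16(lo, hi);
  // Same cross-lane reorder as the unsigned cases.
  return _mm256_permute4x64_epi64(packed, 0xD8);
}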