summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@gmail.com>2016-08-07 21:52:59 +0000
committerCraig Topper <craig.topper@gmail.com>2016-08-07 21:52:59 +0000
commitf44423120fa98e2e0e2e98b6e6daa0db6213a7f0 (patch)
tree14e836ad38309ba92dc3a202f59268aad26a60fa
parentda178822c240b7d12f21cbd5a9e700d56078ad10 (diff)
downloadbcm5719-llvm-f44423120fa98e2e0e2e98b6e6daa0db6213a7f0.tar.gz
bcm5719-llvm-f44423120fa98e2e0e2e98b6e6daa0db6213a7f0.zip
[AVX-512] Improve lowering of inserting a single element into lowest element of a 512-bit vector of zeroes by using vmovq/vmovd/vmovss/vmovsd.
llvm-svn: 277965
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp18
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td8
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll11
-rw-r--r--llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll18
4 files changed, 35 insertions, 20 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 47a66c55b80..1fb398386e9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6691,12 +6691,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
- if (VT.is512BitVector()) {
- SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
- return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
- Item, DAG.getIntPtrConstant(0, dl));
- }
- assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ assert((VT.is128BitVector() || VT.is256BitVector() ||
+ VT.is512BitVector()) &&
"Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
@@ -12091,6 +12087,16 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Subtarget.hasAVX512() &&
"Cannot lower 512-bit vectors w/ basic ISA!");
+ // If we have a single input to the zero element, insert that into V1 if we
+ // can do so cheaply.
+ int NumElts = Mask.size();
+ int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
+
+ if (NumV2Elements == 1 && Mask[0] >= NumElts)
+ if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
+ DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return Insertion;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast =
lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 893078c299c..86afd627ed8 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3305,6 +3305,10 @@ let Predicates = [HasAVX512] in {
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
+
+ def : Pat<(v8i64 (X86vzmovl (insert_subvector undef,
+ (v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i64 0), (VMOV64toPQIZrr GR64:$src), sub_xmm)>;
}
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
let AddedComplexity = 20 in {
@@ -3330,6 +3334,10 @@ let Predicates = [HasAVX512] in {
(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
(SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+ def : Pat<(v16i32 (X86vzmovl (insert_subvector undef,
+ (v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
+ (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src), sub_xmm)>;
+
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v8i64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i64 0), (VMOVZPQILo2PQIZrm addr:$src), sub_xmm)>;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 6cd0366d5ad..7172d53c78f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -297,3 +297,14 @@ define <16 x float> @shuffle_v16f32_00_01_10_10_04_05_14_14_08_09_18_18_0c_0d_1c
%shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 16, i32 16, i32 4, i32 5, i32 20, i32 20, i32 8, i32 9, i32 24, i32 24, i32 12, i32 13, i32 28, i32 28>
ret <16 x float> %shuffle
}
+
+define <16 x i32> @insert_mem_and_zero_v16i32(i32* %ptr) {
+; ALL-LABEL: insert_mem_and_zero_v16i32:
+; ALL: # BB#0:
+; ALL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT: retq
+ %a = load i32, i32* %ptr
+ %v = insertelement <16 x i32> undef, i32 %a, i32 0
+ %shuffle = shufflevector <16 x i32> %v, <16 x i32> zeroinitializer, <16 x i32> <i32 0, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <16 x i32> %shuffle
+}
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 6e2df62d5ed..c48821b639f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -147,20 +147,15 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; AVX512F-LABEL: shuffle_v8f64_70000000:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: movl $7, %eax
-; AVX512F-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8f64_70000000:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: movl $7, %eax
-; AVX512F-32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; AVX512F-32-NEXT: vmovd %eax, %xmm1
; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -1116,20 +1111,15 @@ define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) {
;
; AVX512F-LABEL: shuffle_v8i64_70000000:
; AVX512F: # BB#0:
-; AVX512F-NEXT: vpxord %zmm1, %zmm1, %zmm1
; AVX512F-NEXT: movl $7, %eax
-; AVX512F-NEXT: vpinsrq $0, %rax, %xmm1, %xmm2
-; AVX512F-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vmovq %rax, %xmm1
; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT: retq
;
; AVX512F-32-LABEL: shuffle_v8i64_70000000:
; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-32-NEXT: movl $7, %eax
-; AVX512F-32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1
-; AVX512F-32-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; AVX512F-32-NEXT: vinserti32x4 $0, %xmm1, %zmm2, %zmm1
+; AVX512F-32-NEXT: vmovd %eax, %xmm1
; AVX512F-32-NEXT: vpermq %zmm0, %zmm1, %zmm0
; AVX512F-32-NEXT: retl
%shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
OpenPOWER on IntegriCloud