author    Craig Topper <craig.topper@gmail.com>  2016-10-13 05:29:41 +0000
committer Craig Topper <craig.topper@gmail.com>  2016-10-13 05:29:41 +0000
commit    ff23af42992283cc06e65aa1a967330a67a36b74
tree      f89dac3c30e09cb0395e38caae68f3f0d4b89d45 /llvm/lib/Target
parent    05242739c202bf4e22fb5ddaab99d04fbd1be404
[AVX-512] Teach shuffle lowering to recognize 512-bit zero extends.
llvm-svn: 284105
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 29
1 file changed, 27 insertions(+), 2 deletions(-)
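For context (this sketch is not part of the commit): the pattern being recognized is a 512-bit shuffle whose mask interleaves one input's elements with zeros, which is semantically just a zero extension. A minimal illustration of the widening in question, using Clang/GCC vector extensions; the typedef and function names are made up for this example:

    typedef unsigned char v16u8  __attribute__((vector_size(16)));
    typedef unsigned int  v16u32 __attribute__((vector_size(64)));

    // 16 x u8 -> 16 x u32 zero extension. With AVX-512 the whole conversion
    // is a single instruction (vpmovzxbd zmm, xmm); this commit teaches the
    // shuffle lowering to reach that instruction from the equivalent
    // 512-bit shuffle-with-zero masks as well.
    v16u32 widen(v16u8 x) {
      return __builtin_convertvector(x, v16u32);
    }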
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d5a9a2ea3c2..36d783771fd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -8151,8 +8151,10 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
     InputV = ShuffleOffset(InputV);
 
     // For 256-bit vectors, we only need the lower (128-bit) input half.
-    if (VT.is256BitVector())
-      InputV = extract128BitVector(InputV, 0, DAG, DL);
+    // For 512-bit vectors, we only need the lower input half or quarter.
+    if (VT.getSizeInBits() > 128)
+      InputV = extractSubVector(InputV, 0, DAG, DL,
+                                std::max(128, (int)VT.getSizeInBits() / Scale));
 
     InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
     return DAG.getBitcast(VT, InputV);
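The std::max clamp is the heart of this hunk: a zero extend with scale factor Scale reads only VT.getSizeInBits() / Scale bits of the input, but x86 subvector extraction cannot go below 128 bits. A standalone sketch of the same arithmetic, with illustrative names rather than LLVM API:

    #include <algorithm>
    #include <cstdio>

    // Bits of input actually consumed by a zero extend that produces a
    // vtBits-wide vector while widening each element by `scale`, clamped
    // to the minimum 128-bit subvector x86 can extract.
    static int zextInputBits(int vtBits, int scale) {
      return std::max(128, vtBits / scale);
    }

    int main() {
      std::printf("%d\n", zextInputBits(256, 2)); // 128: the 256-bit case
      std::printf("%d\n", zextInputBits(512, 2)); // 256: lower half
      std::printf("%d\n", zextInputBits(512, 4)); // 128: lower quarter
      std::printf("%d\n", zextInputBits(512, 8)); // 128: clamp kicks in
      return 0;
    }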
@@ -12034,6 +12036,14 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
   assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v16i32, V1,
+                                                         V2, Mask, Subtarget,
+                                                         DAG))
+    return ZExt;
+
   // If the shuffle mask is repeated in each 128-bit lane we can use more
   // efficient instructions that mirror the shuffles across the four 128-bit
   // lanes.
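What lowerVectorShuffleAsZeroOrAnyExtend looks for is a mask in which every Scale-th lane walks the source elements in order and all other lanes are known zero. A much-simplified sketch of that mask test; the real helper additionally handles offsets into the source, two-input masks, and a separate zeroable bitmask:

    #include <vector>

    // True if `mask` has the shape of a zero extend with the given scale:
    // lane i*scale carries source element i, and every other lane is
    // zero/undef (marked -1 in this sketch).
    static bool isZExtShuffleMask(const std::vector<int> &mask, int scale) {
      for (int i = 0, e = (int)mask.size(); i != e; ++i) {
        if (i % scale == 0) {
          if (mask[i] != i / scale)
            return false;
        } else if (mask[i] != -1) {
          return false;
        }
      }
      return true;
    }

    int main() {
      // A v16i32 mask that zero extends eight i32s to eight i64s
      // (vpmovzxdq-shaped) passes with scale 2.
      std::vector<int> m = {0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1};
      return isZExtShuffleMask(m, 2) ? 0 : 1;
    }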
@@ -12074,6 +12084,14 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
   assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v32i16, V1,
+                                                         V2, Mask, Subtarget,
+                                                         DAG))
+    return ZExt;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
           lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
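For v32i16 the profitable case is byte-to-word widening: with AVX-512BW the entire 512-bit zero extend is a single VPMOVZXBW. A small illustration via the corresponding intrinsic (compile with -mavx512bw; the function name is made up for this sketch):

    #include <immintrin.h>

    // 32 x u8 -> 32 x u16 zero extension: one vpmovzxbw zmm, ymm.
    // This is the instruction the new zext path lets a v32i16
    // shuffle-with-zeros become.
    __m512i widen_bytes_to_words(__m256i bytes) {
      return _mm512_cvtepu8_epi16(bytes);
    }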
@@ -12113,6 +12131,13 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
   assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
 
+  // Whenever we can lower this as a zext, that instruction is strictly faster
+  // than any alternative. It also allows us to fold memory operands into the
+  // shuffle in many cases.
+  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(DL, MVT::v64i8, V1, V2,
+                                                         Mask, Subtarget, DAG))
+    return ZExt;
+
   // Use dedicated unpack instructions for masks that match their pattern.
   if (SDValue V =
           lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
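At the v64i8 level, a zext-shaped shuffle is literally an interleave with a zero vector. The sketch below spells out, as the shuffle the backend actually sees, the same byte-to-dword widening shown in the convertvector example earlier; it uses Clang's __builtin_shufflevector, where indices 64 and up select from the second (all-zero) operand, and assumes -mavx512bw:

    typedef unsigned char v64u8 __attribute__((vector_size(64)));

    // Spreading the low 16 bytes of x across dword lanes and zero-filling
    // the rest is a 16 x u8 -> 16 x u32 zero extend in disguise; with this
    // patch the backend can recognize the mask and emit a single vpmovzx.
    v64u8 zext_bytes_to_dwords(v64u8 x) {
      v64u8 zero = {};
      return __builtin_shufflevector(
          x, zero,
           0, 64, 64, 64,  1, 64, 64, 64,  2, 64, 64, 64,  3, 64, 64, 64,
           4, 64, 64, 64,  5, 64, 64, 64,  6, 64, 64, 64,  7, 64, 64, 64,
           8, 64, 64, 64,  9, 64, 64, 64, 10, 64, 64, 64, 11, 64, 64, 64,
          12, 64, 64, 64, 13, 64, 64, 64, 14, 64, 64, 64, 15, 64, 64, 64);
    }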