-rw-r--r--   llvm/lib/Target/X86/X86ISelLowering.cpp   33
-rw-r--r--   llvm/test/CodeGen/X86/pr43866.ll           37
2 files changed, 51 insertions, 19 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c7a45f65e98..2862b7aa3b5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5324,15 +5324,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
 
 static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                     const APInt &Zeroable,
+                                    bool V2IsZero,
                                     SmallVectorImpl<int> &WidenedMask) {
-  SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
-  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
-    if (TargetMask[i] == SM_SentinelUndef)
-      continue;
-    if (Zeroable[i])
-      TargetMask[i] = SM_SentinelZero;
+  // Create an alternative mask with info about zeroable elements.
+  // Here we do not set undef elements as zeroable.
+  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+  if (V2IsZero) {
+    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+    for (int i = 0, Size = Mask.size(); i != Size; ++i)
+      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+        ZeroableMask[i] = SM_SentinelZero;
   }
-  return canWidenShuffleElements(TargetMask, WidenedMask);
+  return canWidenShuffleElements(ZeroableMask, WidenedMask);
 }
 
 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
@@ -14817,8 +14820,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
   if (Subtarget.hasAVX2() && V2.isUndef())
     return SDValue();
 
+  bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
   SmallVector<int, 4> WidenedMask;
-  if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
+  if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
     return SDValue();
 
   bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ -17095,23 +17100,13 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
 
   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
 
-  // Create an alternative mask with info about zeroable elements.
-  // Here we do not set undef elements as zeroable.
-  SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
-  if (V2IsZero) {
-    assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
-    for (int i = 0; i != NumElements; ++i)
-      if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
-        ZeroableMask[i] = SM_SentinelZero;
-  }
-
   // Try to collapse shuffles into using a vector type with fewer elements but
   // wider element types. We cap this to not form integers or floating point
   // elements wider than 64 bits, but it might be interesting to form i128
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
-      canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+      canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
     // Shuffle mask widening should not interfere with a broadcast opportunity
     // by obfuscating the operands with bitcasts.
     // TODO: Avoid lowering directly from this top-level function: make this
diff --git a/llvm/test/CodeGen/X86/pr43866.ll b/llvm/test/CodeGen/X86/pr43866.ll
new file mode 100644
index 00000000000..a430975c47d
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr43866.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s
+
+@v2_0 = global <2 x i32> zeroinitializer, align 8
+
+define void @test() {
+; CHECK-LABEL: test:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    andq $-32, %rsp
+; CHECK-NEXT:    subq $64, %rsp
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; CHECK-NEXT:    vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; CHECK-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[0,0],ymm1[6,4],ymm0[4,4]
+; CHECK-NEXT:    vmovaps %ymm0, (%rsp)
+; CHECK-NEXT:    movq %rbp, %rsp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %v8_0 = alloca <8 x i32>, align 32
+  %v8_0.0.v8_0.0..sroa_cast = bitcast <8 x i32>* %v8_0 to i8*
+  %0 = load <2 x i32>, <2 x i32>* @v2_0, align 8
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> <i32 -1, i32 -1>, <8 x i32> <i32 1, i32 3, i32 0, i32 0, i32 3, i32 3, i32 2, i32 2>
+  store volatile <8 x i32> %shuffle, <8 x i32>* %v8_0, align 32
+  ret void
+}
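
The functional change boils down to one rule: a shuffle lane may only be rewritten to SM_SentinelZero before mask widening when the second operand really is an all-zeros build vector. The guard that lowerVectorShuffle already applied is moved into the shared canWidenShuffleElements helper as a V2IsZero flag, so lowerV2X128Shuffle now computes and passes it instead of folding every Zeroable lane unconditionally. The following self-contained C++ sketch illustrates the same two-step idea outside of LLVM; the names widenShuffleMask, SentinelUndef and SentinelZero and the std::vector-based interface are illustrative stand-ins, not LLVM's API.

// A minimal, self-contained sketch of the guarded mask-widening idea from the
// patch above. Illustrative only; it does not reuse LLVM's types or helpers.
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <vector>

namespace {

// Stand-ins for LLVM's SM_SentinelUndef / SM_SentinelZero mask sentinels.
constexpr int SentinelUndef = -1;
constexpr int SentinelZero = -2;

// Try to express a shuffle mask as a mask over elements of twice the width.
// Each adjacent pair of lanes must be entirely undef, entirely zero, or an
// aligned consecutive pair (2k, 2k+1); otherwise the mask cannot be widened.
bool widenShuffleMask(const std::vector<int> &Mask,
                      std::vector<int> &Widened) {
  assert(Mask.size() % 2 == 0 && "expected an even number of mask elements");
  Widened.clear();
  for (std::size_t I = 0; I != Mask.size(); I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 == SentinelUndef && M1 == SentinelUndef) {
      Widened.push_back(SentinelUndef);          // undef pair -> undef lane
    } else if ((M0 == SentinelZero || M0 == SentinelUndef) &&
               (M1 == SentinelZero || M1 == SentinelUndef)) {
      Widened.push_back(SentinelZero);           // zero/undef pair -> zero lane
    } else if (M1 == SentinelUndef && M0 >= 0 && M0 % 2 == 0) {
      Widened.push_back(M0 / 2);                 // undef lane follows its neighbour
    } else if (M0 == SentinelUndef && M1 >= 0 && M1 % 2 == 1) {
      Widened.push_back(M1 / 2);
    } else if (M0 >= 0 && M0 % 2 == 0 && M1 == M0 + 1) {
      Widened.push_back(M0 / 2);                 // aligned consecutive pair
    } else {
      return false;                              // not expressible when widened
    }
  }
  return true;
}

// Mirrors the shape of the patched overload: lanes reported as zeroable are
// only folded to SentinelZero when the second operand really is all zeros.
bool widenShuffleMask(const std::vector<int> &Mask,
                      const std::vector<bool> &Zeroable, bool V2IsZero,
                      std::vector<int> &Widened) {
  std::vector<int> ZeroableMask(Mask);
  if (V2IsZero)
    for (std::size_t I = 0; I != Mask.size(); ++I)
      if (Mask[I] != SentinelUndef && Zeroable[I])
        ZeroableMask[I] = SentinelZero;
  return widenShuffleMask(ZeroableMask, Widened);
}

} // namespace

int main() {
  // A v4 shuffle of two v4 inputs: lanes 2 and 3 both read element 4, i.e.
  // element 0 of the second operand, so the pair (4, 4) is not consecutive.
  std::vector<int> Mask = {0, 1, 4, 4};
  // Suppose a zeroable analysis flags lanes 2 and 3 (for example because the
  // elements they read are undef or constant zero).
  std::vector<bool> Zeroable = {false, false, true, true};
  std::vector<int> Widened;

  // Second operand known to be an all-zeros vector: the pair may be folded to
  // a zero lane and the mask widens to {0, SentinelZero}.
  bool OkZero = widenShuffleMask(Mask, Zeroable, /*V2IsZero=*/true, Widened);
  std::printf("V2IsZero=true : %s\n", OkZero ? "widened" : "not widened");

  // Second operand not known to be zero: the lanes must keep reading element
  // 4, the pair cannot be widened, and no phantom zero input is assumed.
  bool OkOther = widenShuffleMask(Mask, Zeroable, /*V2IsZero=*/false, Widened);
  std::printf("V2IsZero=false: %s\n", OkOther ? "widened" : "not widened");
  return 0;
}

In the sketch, the non-consecutive pair (4, 4) only widens when the caller can vouch that the second operand is a zero vector; without that guarantee the widening is rejected rather than pretending a zero input exists, which is the kind of mismatch behind the wrong lowering reported as PR43866 and exercised by the new test above.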

