-rw-r--r--   llvm/include/llvm/CodeGen/TargetLowering.h           | 15
-rw-r--r--   llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp        | 23
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp        | 13
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h          |  3
-rw-r--r--   llvm/lib/Target/X86/X86ISelLowering.cpp              | 12
-rw-r--r--   llvm/lib/Target/X86/X86ISelLowering.h                |  4
-rw-r--r--   llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll | 24
-rw-r--r--   llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll       | 16
8 files changed, 59 insertions(+), 51 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8b70075d3ae..d5cca60bb1b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -401,8 +401,9 @@ public:
   /// efficiently, casting the load to a smaller vector of larger types and
   /// loading is more efficient, however, this can be undone by optimizations in
   /// dag combiner.
-  virtual bool isLoadBitCastBeneficial(EVT LoadVT,
-                                       EVT BitcastVT) const {
+  virtual bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                       const SelectionDAG &DAG,
+                                       const MachineMemOperand &MMO) const {
     // Don't do if we could do an indexed load on the original type, but not on
     // the new one.
     if (!LoadVT.isSimple() || !BitcastVT.isSimple())
@@ -416,14 +417,18 @@ public:
         getTypeToPromoteTo(ISD::LOAD, LoadMVT) == BitcastVT.getSimpleVT())
       return false;
 
-    return true;
+    bool Fast = false;
+    return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), BitcastVT,
+                              MMO, &Fast) && Fast;
   }
 
   /// Return true if the following transform is beneficial:
   /// (store (y (conv x)), y*)) -> (store x, (x*))
-  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT) const {
+  virtual bool isStoreBitCastBeneficial(EVT StoreVT, EVT BitcastVT,
+                                        const SelectionDAG &DAG,
+                                        const MachineMemOperand &MMO) const {
     // Default to the same logic as loads.
-    return isLoadBitCastBeneficial(StoreVT, BitcastVT);
+    return isLoadBitCastBeneficial(StoreVT, BitcastVT, DAG, MMO);
   }
 
   /// Return true if it is expected to be cheaper to do a store of a non-zero
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cd5de4c1400..09e1195e1fd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11040,14 +11040,11 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
       // as we assume software couldn't rely on the number of accesses of an
       // illegal type.
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isOperationLegal(ISD::LOAD, VT)) &&
-      TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+       TLI.isOperationLegal(ISD::LOAD, VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
 
-    bool Fast = false;
-    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
-                               *LN0->getMemOperand(), &Fast) &&
-        Fast) {
+    if (TLI.isLoadBitCastBeneficial(N0.getValueType(), VT, DAG,
+                                    *LN0->getMemOperand())) {
       SDValue Load =
           DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
                       LN0->getPointerInfo(), LN0->getAlignment(),
@@ -16174,15 +16171,11 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     // illegal type.
     if (((!LegalOperations && !ST->isVolatile()) ||
         TLI.isOperationLegal(ISD::STORE, SVT)) &&
-        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
-      bool Fast = false;
-      if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
-                                 *ST->getMemOperand(), &Fast) &&
-          Fast) {
-        return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
-                            ST->getPointerInfo(), ST->getAlignment(),
-                            ST->getMemOperand()->getFlags(), ST->getAAInfo());
-      }
+        TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
+                                     DAG, *ST->getMemOperand())) {
+      return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+                          ST->getPointerInfo(), ST->getAlignment(),
+                          ST->getMemOperand()->getFlags(), ST->getAAInfo());
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 766294dee23..0ccd58d44aa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -719,8 +719,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
   return (OldSize < 32);
 }
 
-bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
-                                                   EVT CastTy) const {
+bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
+                                                   const SelectionDAG &DAG,
+                                                   const MachineMemOperand &MMO) const {
 
   assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
 
@@ -730,8 +731,12 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy,
   unsigned LScalarSize = LoadTy.getScalarSizeInBits();
   unsigned CastScalarSize = CastTy.getScalarSizeInBits();
 
-  return (LScalarSize < CastScalarSize) ||
-         (CastScalarSize >= 32);
+  if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
+    return false;
+
+  bool Fast = false;
+  return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
+                            MMO, &Fast) && Fast;
 }
 
 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 9723fc3ec66..40ff24f0754 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -182,7 +182,8 @@ public:
                              ISD::LoadExtType ExtType,
                              EVT ExtVT) const override;
 
-  bool isLoadBitCastBeneficial(EVT, EVT) const final;
+  bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
+                               const MachineMemOperand &MMO) const final;
 
   bool storeOfVectorConstantIsCheap(EVT MemVT,
                                     unsigned NumElem,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 370ecefd273..3cab44b0ac1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -4941,8 +4941,9 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
   return Subtarget.hasLZCNT();
 }
 
-bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
-                                                EVT BitcastVT) const {
+bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                                const SelectionDAG &DAG,
+                                                const MachineMemOperand &MMO) const {
   if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() &&
       BitcastVT.getVectorElementType() == MVT::i1)
     return false;
@@ -4950,7 +4951,12 @@ bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
   if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8)
     return false;
 
-  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
+  // If both types are legal vectors, it's always ok to convert them.
+  if (LoadVT.isVector() && BitcastVT.isVector() &&
+      isTypeLegal(LoadVT) && isTypeLegal(BitcastVT))
+    return true;
+
+  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
 }
 
 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 0631fb7dfe8..e0be03bc3f9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1127,7 +1127,9 @@ namespace llvm {
       return NumElem > 2;
     }
 
-    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT) const override;
+    bool isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT,
+                                 const SelectionDAG &DAG,
+                                 const MachineMemOperand &MMO) const override;
 
     /// Intel processors have a unified instruction and data cache
     const char * getClearCacheBuiltinName() const override {
diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
index b742cab5448..8df92e76def 100644
--- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
+++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll
@@ -306,27 +306,25 @@ define void @merge_2_v4f32_align1_ntstore(<4 x float>* %a0, <4 x float>* %a1) no
 ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT:    retl
 ;
@@ -421,27 +419,25 @@ define void @merge_2_v4f32_align1(<4 x float>* %a0, <4 x float>* %a1) nounwind {
 ; X86-SSE2-NEXT:    movdqu 16(%ecx), %xmm1
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, (%eax)
-; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[3,1],xmm0[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 12(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm2, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 8(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 4(%eax)
 ; X86-SSE2-NEXT:    movd %xmm1, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 16(%eax)
-; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1],xmm1[2,3]
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 28(%eax)
 ; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 24(%eax)
-; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X86-SSE2-NEXT:    movd %xmm1, %ecx
+; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; X86-SSE2-NEXT:    movd %xmm0, %ecx
 ; X86-SSE2-NEXT:    movntil %ecx, 20(%eax)
 ; X86-SSE2-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
index f37cd88101b..f4a4bb5e6b4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1OR2 --check-prefix=AVX1
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-shuffle | FileCheck %s --check-prefixes=ALL,AVX,AVX2OR512VL,AVX512VL
 
 define <4 x i32> @shuffle_v4i32_0001(<4 x i32> %a, <4 x i32> %b) {
 ; SSE-LABEL: shuffle_v4i32_0001:
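To illustrate the updated hook, the sketch below shows how an out-of-tree target might override isLoadBitCastBeneficial now that the SelectionDAG and the load's MachineMemOperand are passed in. This is a minimal sketch, not code from this patch: the class name MyTargetLowering and its i1-vector rule are assumptions. The pattern mirrors what the patch does for AMDGPU and X86: apply a target-specific type filter first, then defer to the base implementation, which now consults allowsMemoryAccess() on the actual memory operand instead of reasoning about types alone.

// Hypothetical target override -- a sketch, not part of this patch.
bool MyTargetLowering::isLoadBitCastBeneficial(
    EVT LoadVT, EVT BitcastVT, const SelectionDAG &DAG,
    const MachineMemOperand &MMO) const {
  // Assumed target-specific rule: never rewrite a load into an i1 vector load.
  if (BitcastVT.isVector() && BitcastVT.getScalarType() == MVT::i1)
    return false;

  // Defer to the default logic, which now checks via allowsMemoryAccess()
  // that the cast type is actually fast to access for this particular
  // memory operand (alignment, address space, flags).
  return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO);
}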

