diff options
author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-02-27 12:20:37 +0000 |
---|---|---|
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-02-27 12:20:37 +0000 |
commit | ba43ec8702456aa4a435569ca2982c4ec444976f (patch) | |
tree | fde8a0156f300c8104c994a3f774c3f0aecaf4aa | |
parent | c11ae185aa417bf261af83f0bbe6cea14a8e9c01 (diff) | |
download | bcm5719-llvm-ba43ec8702456aa4a435569ca2982c4ec444976f.tar.gz bcm5719-llvm-ba43ec8702456aa4a435569ca2982c4ec444976f.zip |
[X86][AVX] combineLoopMAddPattern - support 256-bit cases on AVX1 via SplitBinaryOpsAndApply
llvm-svn: 326189
-rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 10 | ||||
-rw-r--r-- | llvm/test/CodeGen/X86/madd.ll | 60 |
2 files changed, 34 insertions, 36 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4a9e7e7cb41..5449ef788d8 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37401,7 +37401,7 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
   unsigned RegSize = 128;
   if (Subtarget.useBWIRegs())
     RegSize = 512;
-  else if (Subtarget.hasAVX2())
+  else if (Subtarget.hasAVX())
     RegSize = 256;
   unsigned VectorSize = VT.getVectorNumElements() * 16;
   // If the vector size is less than 128, or greater than the supported RegSize,
@@ -37420,7 +37420,13 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
   SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
 
   // Madd vector size is half of the original vector size
-  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+  auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
+                           SDValue Op1) {
+    MVT VT = MVT::getVectorVT(MVT::i32, Op0.getValueSizeInBits() / 32);
+    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Op0, Op1);
+  };
+  SDValue Madd = SplitBinaryOpsAndApply(DAG, Subtarget, DL, MAddVT, N0, N1,
+                                        PMADDWDBuilder);
   // Fill the rest of the output with 0
   SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
   SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index d21fd4e0ca4..0e472c19cb0 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -167,33 +167,25 @@ define i32 @_Z10test_shortPsS_i_512(i16* nocapture readonly, i16* nocapture read
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB1_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vpmovsxwd 8(%rdi,%rcx,2), %xmm2
-; AVX1-NEXT:    vpmovsxwd (%rdi,%rcx,2), %xmm3
-; AVX1-NEXT:    vpackssdw %xmm2, %xmm3, %xmm2
-; AVX1-NEXT:    vpmovsxwd 24(%rdi,%rcx,2), %xmm3
-; AVX1-NEXT:    vpmovsxwd 16(%rdi,%rcx,2), %xmm4
-; AVX1-NEXT:    vpackssdw %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpmovsxwd 8(%rsi,%rcx,2), %xmm4
-; AVX1-NEXT:    vpmovsxwd (%rsi,%rcx,2), %xmm5
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpmovsxwd 24(%rsi,%rcx,2), %xmm4
-; AVX1-NEXT:    vpmovsxwd 16(%rsi,%rcx,2), %xmm5
-; AVX1-NEXT:    vpackssdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm3
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vmovdqu (%rdi,%rcx,2), %ymm2
+; AVX1-NEXT:    vmovdqu (%rsi,%rcx,2), %ymm3
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT:    vpmaddwd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
@@ -653,25 +645,25 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3
 ; AVX1-NEXT:    .p2align 4, 0x90
 ; AVX1-NEXT:  .LBB4_1: # %vector.body
 ; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
-; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm2
-; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm3
+; AVX1-NEXT:    vpmovsxbw 8(%rdi,%rcx), %xmm2
+; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm3
+; AVX1-NEXT:    vpmaddwd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpmovsxbw (%rdi,%rcx), %xmm3
 ; AVX1-NEXT:    vpmovsxbw (%rsi,%rcx), %xmm4
-; AVX1-NEXT:    vpmaddwd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vpmovsxbw 8(%rsi,%rcx), %xmm4
 ; AVX1-NEXT:    vpmaddwd %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm3
-; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
-; AVX1-NEXT:    vpaddd %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    addq $16, %rcx
 ; AVX1-NEXT:    cmpq %rcx, %rax
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %middle.block
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]