diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-10-02 15:59:15 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2016-10-02 15:59:15 +0000 |
| commit | 03afbe783d64d55ee8ef4b69cde96f1e2ce7183e (patch) | |
| tree | 10e6a8fcffef5148e829f5ec69b5831c202c30f6 /llvm | |
| parent | 6b582bf91f134f95bd5100beb03db9464769f1ef (diff) | |
| download | bcm5719-llvm-03afbe783d64d55ee8ef4b69cde96f1e2ce7183e.tar.gz bcm5719-llvm-03afbe783d64d55ee8ef4b69cde96f1e2ce7183e.zip | |
[X86][AVX] Ensure broadcast loads respect dependencies
To allow broadcast loads of a non-zeroth vector element, lowerVectorShuffleAsBroadcast can replace a load with a new load at an adjusted address, but unfortunately we weren't ensuring that the new load respected the same dependencies.
This patch adds a TokenFactor and updates all dependencies of the old load to reference the new load instead.
Bug found during internal testing.
Differential Revision: https://reviews.llvm.org/D25039
llvm-svn: 283070
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.cpp | 11 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-vbroadcast.ll | 20 |
2 files changed, 22 insertions, 9 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 1bece04a38d..674aa257db1 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -8683,6 +8683,17 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr, DAG.getMachineFunction().getMachineMemOperand( Ld->getMemOperand(), Offset, SVT.getStoreSize())); + + // Make sure the newly-created LOAD is in the same position as Ld in + // terms of dependency. We create a TokenFactor for Ld and V, + // and update uses of Ld's output chain to use the TokenFactor. + if (Ld->hasAnyUseOfValue(1)) { + SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + SDValue(Ld, 1), SDValue(V.getNode(), 1)); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); + DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1), + SDValue(V.getNode(), 1)); + } } else if (!BroadcastFromReg) { // We can't broadcast from a vector register. return SDValue(); diff --git a/llvm/test/CodeGen/X86/avx-vbroadcast.ll b/llvm/test/CodeGen/X86/avx-vbroadcast.ll index 766294cd5e1..0cd236da24a 100644 --- a/llvm/test/CodeGen/X86/avx-vbroadcast.ll +++ b/llvm/test/CodeGen/X86/avx-vbroadcast.ll @@ -548,38 +548,40 @@ define <4 x double> @splat_concat4(double* %p) { } ; -; FIXME: When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies. +; When VBROADCAST replaces an existing load, ensure it still respects lifetime dependencies. 
; define float @broadcast_lifetime() nounwind { ; X32-LABEL: broadcast_lifetime: ; X32: ## BB#0: ; X32-NEXT: pushl %esi -; X32-NEXT: subl $40, %esp +; X32-NEXT: subl $56, %esp ; X32-NEXT: leal {{[0-9]+}}(%esp), %esi ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _gfunc +; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0 +; X32-NEXT: vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill ; X32-NEXT: movl %esi, (%esp) ; X32-NEXT: calll _gfunc ; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0 -; X32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 -; X32-NEXT: vsubss %xmm0, %xmm1, %xmm0 +; X32-NEXT: vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 16-byte Folded Reload ; X32-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: flds {{[0-9]+}}(%esp) -; X32-NEXT: addl $40, %esp +; X32-NEXT: addl $56, %esp ; X32-NEXT: popl %esi ; X32-NEXT: retl ; ; X64-LABEL: broadcast_lifetime: ; X64: ## BB#0: -; X64-NEXT: subq $24, %rsp +; X64-NEXT: subq $40, %rsp ; X64-NEXT: movq %rsp, %rdi ; X64-NEXT: callq _gfunc +; X64-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm0 +; X64-NEXT: vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill ; X64-NEXT: movq %rsp, %rdi ; X64-NEXT: callq _gfunc ; X64-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm0 -; X64-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %xmm1 -; X64-NEXT: vsubss %xmm0, %xmm1, %xmm0 -; X64-NEXT: addq $24, %rsp +; X64-NEXT: vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload +; X64-NEXT: addq $40, %rsp ; X64-NEXT: retq %1 = alloca <4 x float>, align 16 %2 = alloca <4 x float>, align 16 |

