diff options
| author | Sanjay Patel <spatel@rotateright.com> | 2018-01-06 16:16:04 +0000 |
|---|---|---|
| committer | Sanjay Patel <spatel@rotateright.com> | 2018-01-06 16:16:04 +0000 |
| commit | 5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe (patch) | |
| tree | b3339387d653e097d5942262ae56c44ece95f984 /llvm/lib | |
| parent | b77bc6bb8b1df9b05a9cda0555d3c58655aba5ae (diff) | |
| download | bcm5719-llvm-5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe.tar.gz bcm5719-llvm-5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe.zip | |
[x86, MemCmpExpansion] allow 2 pairs of loads per block (PR33325)
This is the last step needed to fix PR33325:
https://bugs.llvm.org/show_bug.cgi?id=33325
We're trading branches and compares for loads and logic ops.
This makes the code smaller and hopefully faster in most cases.
The 24-byte test shows an interesting construct: we load the trailing scalar
elements into vector registers and generate the same pcmpeq+movmsk code that
we expected for a pair of full vector elements (see the 32- and 64-byte tests).
Differential Revision: https://reviews.llvm.org/D41714
llvm-svn: 321934
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/CodeGen/ExpandMemCmp.cpp | 8 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ISelLowering.h | 5 |
2 files changed, 7 insertions, 6 deletions
```diff
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 20a240fd344..d73e2c4670b 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -564,12 +564,8 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion() {
-  // A memcmp with zero-comparison with only one block of load and compare does
-  // not need to set up any extra blocks. This case could be handled in the DAG,
-  // but since we have all of the machinery to flexibly expand any memcpy here,
-  // we choose to handle this case too to avoid fragmented lowering.
-  if ((!IsUsedForZeroCmp && NumLoadsPerBlockForZeroCmp != 1) ||
-      getNumBlocks() != 1) {
+  // Create the basic block framework for a multi-block expansion.
+  if (getNumBlocks() != 1) {
     BasicBlock *StartBlock = CI->getParent();
     EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
     setupEndBlockPHINodes();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 1fb7c7ed4e9..c540f29f165 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -829,6 +829,11 @@ namespace llvm {
     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
     MVT hasFastEqualityCompare(unsigned NumBits) const override;
 
+    /// Allow multiple load pairs per block for smaller and faster code.
+    unsigned getMemcmpEqZeroLoadsPerBlock() const override {
+      return 2;
+    }
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
```

