[x86, MemCmpExpansion] allow 2 pairs of loads per block (PR33325)

This is the last step needed to fix PR33325 (https://bugs.llvm.org/show_bug.cgi?id=33325). We're trading branches and compares for loads and logic ops. This makes the code smaller and hopefully faster in most cases. The 24-byte test shows an interesting construct: we load the trailing scalar elements into vector registers and generate the same pcmpeq+movmsk code that we expected for a pair of full vector elements (see the 32- and 64-byte tests).

Differential Revision: https://reviews.llvm.org/D41714
llvm-svn: 321934
author: Sanjay Patel <spatel@rotateright.com> 2018-01-06 16:16:04 +0000
committer: Sanjay Patel <spatel@rotateright.com> 2018-01-06 16:16:04 +0000
commit: 5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe (patch)
tree: b3339387d653e097d5942262ae56c44ece95f984 /llvm/lib
parent: b77bc6bb8b1df9b05a9cda0555d3c58655aba5ae (diff)
download: bcm5719-llvm-5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe.tar.gz
bcm5719-llvm-5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe.zip
2 files changed, 7 insertions, 6 deletions
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index 20a240fd344..d73e2c4670b 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -564,12 +564,8 @@ Value *MemCmpExpansion::getMemCmpOneBlock() {
 // This function expands the memcmp call into an inline expansion and returns
 // the memcmp result.
 Value *MemCmpExpansion::getMemCmpExpansion() {
-  // A memcmp with zero-comparison with only one block of load and compare does
-  // not need to set up any extra blocks. This case could be handled in the DAG,
-  // but since we have all of the machinery to flexibly expand any memcpy here,
-  // we choose to handle this case too to avoid fragmented lowering.
-  if ((!IsUsedForZeroCmp && NumLoadsPerBlockForZeroCmp != 1) ||
-      getNumBlocks() != 1) {
+  // Create the basic block framework for a multi-block expansion.
+  if (getNumBlocks() != 1) {
     BasicBlock *StartBlock = CI->getParent();
     EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
     setupEndBlockPHINodes();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 1fb7c7ed4e9..c540f29f165 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -829,6 +829,11 @@ namespace llvm {
     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
     MVT hasFastEqualityCompare(unsigned NumBits) const override;
 
+    /// Allow multiple load pairs per block for smaller and faster code.
+    unsigned getMemcmpEqZeroLoadsPerBlock() const override {
+      return 2;
+    }
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
author	Sanjay Patel <spatel@rotateright.com>	2018-01-06 16:16:04 +0000
committer	Sanjay Patel <spatel@rotateright.com>	2018-01-06 16:16:04 +0000
commit	5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe (patch)
tree	b3339387d653e097d5942262ae56c44ece95f984 /llvm/lib
parent	b77bc6bb8b1df9b05a9cda0555d3c58655aba5ae (diff)
download	bcm5719-llvm-5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe.tar.gz bcm5719-llvm-5a48aef3f0dbb0934e266dbd068ff46dff5c4dbe.zip