From b1cfcd1a5667c55fbcd96fd4bd49db70ce393856 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 2 Aug 2019 23:43:53 +0000
Subject: [ScalarizeMaskedMemIntrin] Bitcast the mask to the scalar domain and
 use scalar bit tests for the branches for expandload/compressstore.

Same as what was done for gather/scatter/load/store in r367489.
Expandload/compressstore were delayed due to lack of constant masking
handling that has since been fixed.

llvm-svn: 367738
---
 llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 35 ++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

(limited to 'llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp')

diff --git a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index 71bd0fe3229..515582640ed 100644
--- a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -634,6 +634,14 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
     return;
   }
 
+  // If the mask is not v1i1, use scalar bit test operations. This generates
+  // better results on X86 at least.
+  Value *SclrMask;
+  if (VectorWidth != 1) {
+    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+  }
+
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
@@ -642,8 +650,14 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
     //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
     //  br i1 %mask_1, label %cond.load, label %else
     //
-    Value *Predicate =
-        Builder.CreateExtractElement(Mask, Idx);
+    Value *Predicate;
+    if (VectorWidth != 1) {
+      Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+      Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+                                       Builder.getIntN(VectorWidth, 0));
+    } else {
+      Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+    }
 
     // Create "cond" block
     //
@@ -728,13 +742,28 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
     return;
   }
 
+  // If the mask is not v1i1, use scalar bit test operations. This generates
+  // better results on X86 at least.
+  Value *SclrMask;
+  if (VectorWidth != 1) {
+    Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+    SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+  }
+
   for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
     // Fill the "else" block, created in the previous iteration
     //
     //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
     //  br i1 %mask_1, label %cond.store, label %else
     //
-    Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+    Value *Predicate;
+    if (VectorWidth != 1) {
+      Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+      Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+                                       Builder.getIntN(VectorWidth, 0));
+    } else {
+      Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+    }
 
     // Create "cond" block
     //
--
cgit v1.2.3
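
A minimal standalone sketch of the transformation this patch applies may help:
the <N x i1> mask is bitcast once to an iN scalar, and each lane's branch
condition then becomes an and with a single-bit constant plus a compare
against zero, instead of a per-lane vector extractelement. This sketch is not
part of the commit: the function name lane_predicate, the fixed <8 x i1>
width, and lane index 3 are invented for illustration, and the IRBuilder
calls target the LLVM C++ API of this patch's era (circa LLVM 9).

#include "llvm/ADT/APInt.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("mask-bittest-demo", Ctx);
  IRBuilder<> Builder(Ctx);

  const unsigned VectorWidth = 8; // pretend the intrinsic's mask is <8 x i1>
  const unsigned Idx = 3;         // the lane whose branch predicate we build

  // define i1 @lane_predicate(<8 x i1> %mask)
  Type *MaskTy = VectorType::get(Builder.getInt1Ty(), VectorWidth);
  FunctionType *FTy =
      FunctionType::get(Builder.getInt1Ty(), {MaskTy}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "lane_predicate", &M);
  Value *Mask = &*F->arg_begin();
  Mask->setName("mask");

  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
  Builder.SetInsertPoint(Entry);

  // One bitcast moves the whole mask to the scalar domain: <8 x i1> -> i8.
  Value *SclrMask =
      Builder.CreateBitCast(Mask, Builder.getIntNTy(VectorWidth), "scalar_mask");

  // Per lane, test one bit of the scalar: (scalar_mask & (1 << Idx)) != 0.
  // This replaces the per-lane vector extractelement the old code emitted.
  Value *Bit = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
  Value *Predicate =
      Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Bit),
                           Builder.getIntN(VectorWidth, 0), "pred");
  Builder.CreateRet(Predicate);

  M.print(outs(), nullptr); // dump the generated IR
  return 0;
}

For Idx = 3 the printed body is essentially: bitcast <8 x i1> %mask to i8,
and i8 %scalar_mask, 8, icmp ne i8 ..., 0. On X86 the and+icmp lowers to a
scalar test-and-branch rather than a vector element extract per lane, which
is presumably what the patch comment means by "better results on X86".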