diff options
author | Benjamin Kramer <benny.kra@googlemail.com> | 2012-12-18 18:40:20 +0000 |
---|---|---|
committer | Benjamin Kramer <benny.kra@googlemail.com> | 2012-12-18 18:40:20 +0000 |
commit | f0e5d2f032496b9484c7634cb127b2d645d4b337 (patch) | |
tree | aef57a2bccdb563e43c456e166f8ef630f908658 /llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | |
parent | 39e7c6e370ffaf941503cd8b3f54767e4e7fa886 (diff) | |
download | bcm5719-llvm-f0e5d2f032496b9484c7634cb127b2d645d4b337.tar.gz bcm5719-llvm-f0e5d2f032496b9484c7634cb127b2d645d4b337.zip |
LoopVectorize: Emit reductions as log2(vectorsize) shuffles + vector ops instead of scalar operations.
For example on x86 with SSE4.2 a <8 x i8> add reduction becomes
movdqa %xmm0, %xmm1
movhlps %xmm1, %xmm1 ## xmm1 = xmm1[1,1]
paddw %xmm0, %xmm1
pshufd $1, %xmm1, %xmm0 ## xmm0 = xmm1[1,0,0,0]
paddw %xmm1, %xmm0
phaddw %xmm0, %xmm0
pextrb $0, %xmm0, %edx
instead of
pextrb $2, %xmm0, %esi
pextrb $0, %xmm0, %edx
addb %sil, %dl
pextrb $4, %xmm0, %esi
addb %dl, %sil
pextrb $6, %xmm0, %edx
addb %sil, %dl
pextrb $8, %xmm0, %esi
addb %dl, %sil
pextrb $10, %xmm0, %edi
pextrb $14, %xmm0, %edx
addb %sil, %dil
pextrb $12, %xmm0, %esi
addb %dil, %sil
addb %sil, %dl
llvm-svn: 170439
Diffstat (limited to 'llvm/lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 43 |
1 files changed, 31 insertions, 12 deletions
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d143f919cef..e3c76bb15dd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -817,34 +817,53 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { NewPhi->addIncoming(VectorStart, LoopBypassBlock); NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); - // Extract the first scalar. - Value *Scalar0 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and reduce the remaining vector elements. - for (unsigned i=1; i < VF; ++i) { - Value *Scalar1 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = NewPhi; + SmallVector<Constant*, 32> ShuffleMask(VF, 0); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i/2; ++j) + ShuffleMask[j] = Builder.getInt32(i/2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i/2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = + Builder.CreateShuffleVector(TmpVec, + UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), + "rdx.shuf"); + + // Emit the operation on the shuffled value. switch (RdxDesc.Kind) { case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); + TmpVec = Builder.CreateAdd(TmpVec, Shuf, "add.rdx"); break; case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); + TmpVec = Builder.CreateMul(TmpVec, Shuf, "mul.rdx"); break; case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); + TmpVec = Builder.CreateOr(TmpVec, Shuf, "or.rdx"); break; case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); + TmpVec = Builder.CreateAnd(TmpVec, Shuf, "and.rdx"); break; case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); + TmpVec = Builder.CreateXor(TmpVec, Shuf, "xor.rdx"); break; default: llvm_unreachable("Unknown reduction operation"); } } + // The result is in the first element of the vector. + Value *Scalar0 = Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); + // Now, we need to fix the users of the reduction variable // inside and outside of the scalar remainder loop. // We know that the loop is in LCSSA form. We need to update the |