author     Amara Emerson <amara.emerson@arm.com>   2017-05-09 10:43:25 +0000
committer  Amara Emerson <amara.emerson@arm.com>   2017-05-09 10:43:25 +0000
commit     cf9daa33a7870c235e0edc176dd40579f376cafc
tree       4df699a6f02c81cbbc2c7c4639c299a0dea5632c /llvm/lib
parent     b7bf386e8098aed73f0b9b2df40067afc07dffab
Introduce experimental generic intrinsics for horizontal vector reductions.
- This change allows targets to opt-in to using them instead of the log2
  shufflevector algorithm.
- The SLP and Loop vectorizers have the common code to do shuffle reductions
  factored out into LoopUtils, and now have a unified interface for generating
  reductions regardless of the preference of the target. LoopUtils now uses TTI
  to determine what kind of reductions the target wants to handle.
- For CodeGen, basic legalization support is added.

Differential Revision: https://reviews.llvm.org/D30086

llvm-svn: 302514
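As a hedged illustration (not part of the diff below), calls to the new
intrinsics look roughly like this at the IR level; the overloaded-name
suffixes are inferred from the IRBuilder changes in this patch:

    ; Integer add reduction of a <4 x i32> down to a single i32.
    declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32>)

    ; The fadd/fmul variants take an extra scalar accumulator operand.
    declare float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float, <4 x float>)

    define i32 @reduce(<4 x i32> %v) {
      %r = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %v)
      ret i32 %r
    }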
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp6
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h1
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp58
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp88
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp13
-rw-r--r--llvm/lib/IR/IRBuilder.cpp88
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp202
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp37
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp48
12 files changed, 480 insertions, 66 deletions
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 26d606cce9b..a73fe65e965 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -500,6 +500,12 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
+bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
+ Type *Ty, ReductionFlags Flags) const {
+ return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
+}
+
+
TargetTransformInfo::Concept::~Concept() {}
TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 722f17a8067..2d2249da4e1 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
using namespace llvm;
using namespace llvm::PatternMatch;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cde4331cc42..4c3b514856b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -675,6 +675,7 @@ private:
// Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
bool SplitVectorOperand(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo);
SDValue SplitVecOp_UnaryOp(SDNode *N);
SDValue SplitVecOp_TruncateHelper(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 97a7fab6efd..ff0e609803d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::ZERO_EXTEND_VECTOR_INREG:
Res = SplitVecOp_ExtVecInRegOp(N);
break;
+
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_ADD:
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ case ISD::VECREDUCE_SMAX:
+ case ISD::VECREDUCE_SMIN:
+ case ISD::VECREDUCE_UMAX:
+ case ISD::VECREDUCE_UMIN:
+ case ISD::VECREDUCE_FMAX:
+ case ISD::VECREDUCE_FMIN:
+ Res = SplitVecOp_VECREDUCE(N, OpNo);
+ break;
}
}
@@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
}
+SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) {
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+
+ SDValue VecOp = N->getOperand(OpNo);
+ EVT VecVT = VecOp.getValueType();
+ assert(VecVT.isVector() && "Can only split reduce vector operand");
+ GetSplitVector(VecOp, Lo, Hi);
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT);
+
+ bool NoNaN = N->getFlags().hasNoNaNs();
+ unsigned CombineOpc = 0;
+ switch (N->getOpcode()) {
+ case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break;
+ case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break;
+ case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break;
+ case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break;
+ case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break;
+ case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break;
+ case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break;
+ case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break;
+ case ISD::VECREDUCE_FMAX:
+ CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN;
+ break;
+ case ISD::VECREDUCE_FMIN:
+ CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN;
+ break;
+ default:
+ llvm_unreachable("Unexpected reduce ISD node");
+ }
+
+ // Combine the two split halves with a full-width vector binary op, then
+ // reduce the resulting half-length vector with the original opcode.
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi);
+ return DAG.getNode(N->getOpcode(), dl, ResVT, Partial);
+}
+
SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
// The result has a legal vector type, but the input needs splitting.
EVT ResVT = N->getValueType(0);
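The splitting strategy above is worth a note: rather than reducing each half
separately and combining two scalars, the two subvectors are first combined
element-wise with the matching binary opcode (FADD for VECREDUCE_FADD, SMAX
for VECREDUCE_SMAX, and so on), and the reduction node is re-emitted on the
now-legal half-width vector. Schematically, for an illegal <8 x i32> input
(illustrative pseudo-DAG, not compiler output):

    vecreduce_add(v8i32 V)
      -> Lo, Hi = split(V)              ; two v4i32 halves
         vecreduce_add(add(Lo, Hi))     ; single legal v4i32 reduction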
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d605a1dc1c2..ab1eaae164b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5970,7 +5970,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
unsigned NumOps = Ops.size();
switch (NumOps) {
case 0: return getNode(Opcode, DL, VT);
- case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 1: return getNode(Opcode, DL, VT, Ops[0], Flags);
case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
default: break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 50313e2da88..4ccf8c97168 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5737,6 +5737,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
case Intrinsic::experimental_deoptimize:
LowerDeoptimizeCall(&I);
return nullptr;
+
+ case Intrinsic::experimental_vector_reduce_fadd:
+ case Intrinsic::experimental_vector_reduce_fmul:
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ visitVectorReduce(I, Intrinsic);
+ return nullptr;
+ }
+
}
}
@@ -7616,6 +7634,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
FuncInfo.MF->getFrameInfo().setHasPatchPoint();
}
+void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
+ unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2;
+ if (I.getNumArgOperands() > 1)
+ Op2 = getValue(I.getArgOperand(1));
+ SDLoc dl = getCurSDLoc();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ SDValue Res;
+ FastMathFlags FMF;
+ if (isa<FPMathOperator>(I))
+ FMF = I.getFastMathFlags();
+ SDNodeFlags SDFlags;
+ SDFlags.setNoNaNs(FMF.noNaNs());
+
+ switch (Intrinsic) {
+ case Intrinsic::experimental_vector_reduce_fadd:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmul:
+ if (FMF.unsafeAlgebra())
+ Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
+ else
+ Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_mul:
+ Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_and:
+ Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_xor:
+ Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smax:
+ Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_smin:
+ Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umax:
+ Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_umin:
+ Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
+ break;
+ case Intrinsic::experimental_vector_reduce_fmax: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
+ break;
+ }
+ case Intrinsic::experimental_vector_reduce_fmin: {
+ Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled vector reduce intrinsic");
+ }
+ setValue(&I, Res);
+}
+
/// Returns an AttributeList representing the attributes applied to the return
/// value of the given call.
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
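Note how visitVectorReduce keys the fadd/fmul lowering on fast-math flags:
under unsafe algebra the scalar accumulator is ignored and the relaxed
VECREDUCE_FADD/VECREDUCE_FMUL node is emitted on the vector operand alone,
while the default path emits the strict (in-order) variant with both
operands. A hedged IR-level sketch, reusing the manglings assumed earlier:

    %fast   = call fast float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(
                  float %acc, <4 x float> %v)   ; lowers to VECREDUCE_FADD
    %strict = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(
                  float %acc, <4 x float> %v)   ; lowers to VECREDUCE_STRICT_FADD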
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 9e9989058ae..010104b05c5 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -909,6 +909,8 @@ private:
void visitGCRelocate(const GCRelocateInst &I);
void visitGCResult(const GCResultInst &I);
+ void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
+
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 26dd45ef933..c37d7080f2c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SETFALSE: return "setfalse";
case ISD::SETFALSE2: return "setfalse2";
}
+ case ISD::VECREDUCE_FADD: return "vecreduce_fadd";
+ case ISD::VECREDUCE_FMUL: return "vecreduce_fmul";
+ case ISD::VECREDUCE_ADD: return "vecreduce_add";
+ case ISD::VECREDUCE_MUL: return "vecreduce_mul";
+ case ISD::VECREDUCE_AND: return "vecreduce_and";
+ case ISD::VECREDUCE_OR: return "vecreduce_or";
+ case ISD::VECREDUCE_XOR: return "vecreduce_xor";
+ case ISD::VECREDUCE_SMAX: return "vecreduce_smax";
+ case ISD::VECREDUCE_SMIN: return "vecreduce_smin";
+ case ISD::VECREDUCE_UMAX: return "vecreduce_umax";
+ case ISD::VECREDUCE_UMIN: return "vecreduce_umin";
+ case ISD::VECREDUCE_FMAX: return "vecreduce_fmax";
+ case ISD::VECREDUCE_FMIN: return "vecreduce_fmin";
}
}
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index e265a823687..3477c087967 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align,
return CI;
}
+static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID,
+ Value *Src) {
+ Module *M = Builder->GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Src};
+ Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() };
+ auto Decl = Intrinsic::getDeclaration(M, ID, Tys);
+ return createCallHelper(Decl, Ops, Builder);
+}
+
+CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fadd, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
+ Module *M = GetInsertBlock()->getParent()->getParent();
+ Value *Ops[] = {Acc, Src};
+ Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(),
+ Src->getType()};
+ auto Decl = Intrinsic::getDeclaration(
+ M, Intrinsic::experimental_vector_reduce_fmul, Tys);
+ return createCallHelper(Decl, Ops, this);
+}
+
+CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateMulReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateAndReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateOrReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateXorReduce(Value *Src) {
+ return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor,
+ Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax
+ : Intrinsic::experimental_vector_reduce_umax;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) {
+ auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smin
+ : Intrinsic::experimental_vector_reduce_umin;
+ return getReductionIntrinsic(this, ID, Src);
+}
+
+CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmax, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
+CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) {
+ auto Rdx = getReductionIntrinsic(
+ this, Intrinsic::experimental_vector_reduce_fmin, Src);
+ if (NoNaN) {
+ FastMathFlags FMF;
+ FMF.setNoNaNs();
+ Rdx->setFastMathFlags(FMF);
+ }
+ return Rdx;
+}
+
CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
assert(isa<PointerType>(Ptr->getType()) &&
"lifetime.start only applies to pointers.");
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 175d013a011..d41fe6267a0 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -18,6 +18,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -1112,3 +1113,204 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
else
return (FalseVal + (TrueVal / 2)) / TrueVal;
}
+
+/// \brief Adds a 'fast' flag to floating point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V)) {
+ FastMathFlags Flags;
+ Flags.setUnsafeAlgebra();
+ cast<Instruction>(V)->setFastMathFlags(Flags);
+ }
+ return V;
+}
+
+// Helper to generate a log2 shuffle reduction.
+static Value *
+getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+ RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+ RecurrenceDescriptor::MRK_Invalid,
+ ArrayRef<Value *> RedOps = ArrayRef<Value *>()) {
+ unsigned VF = Src->getType()->getVectorNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+ UndefValue::get(Builder.getInt32Ty()));
+
+ Value *Shuf = Builder.CreateShuffleVector(
+ TmpVec, UndefValue::get(TmpVec->getType()),
+ ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // Floating point operations had to be 'fast' to enable the reduction.
+ TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
+ TmpVec, Shuf, "bin.rdx"));
+ } else {
+ assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
+ "Invalid min/max");
+ TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec,
+ Shuf);
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
+/// Create a simple vector reduction specified by an opcode and some
+/// flags (if generating min/max reductions).
+Value *llvm::createSimpleTargetReduction(
+ IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ Value *Src, TargetTransformInfo::ReductionFlags Flags,
+ ArrayRef<Value *> RedOps) {
+ assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+
+ Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType());
+ std::function<Value*()> BuildFunc;
+ using RD = RecurrenceDescriptor;
+ RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid;
+ // TODO: Support creating ordered reductions.
+ FastMathFlags FMFUnsafe;
+ FMFUnsafe.setUnsafeAlgebra();
+
+ switch (Opcode) {
+ case Instruction::Add:
+ BuildFunc = [&]() { return Builder.CreateAddReduce(Src); };
+ break;
+ case Instruction::Mul:
+ BuildFunc = [&]() { return Builder.CreateMulReduce(Src); };
+ break;
+ case Instruction::And:
+ BuildFunc = [&]() { return Builder.CreateAndReduce(Src); };
+ break;
+ case Instruction::Or:
+ BuildFunc = [&]() { return Builder.CreateOrReduce(Src); };
+ break;
+ case Instruction::Xor:
+ BuildFunc = [&]() { return Builder.CreateXorReduce(Src); };
+ break;
+ case Instruction::FAdd:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::FMul:
+ BuildFunc = [&]() {
+ auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src);
+ cast<CallInst>(Rdx)->setFastMathFlags(FMFUnsafe);
+ return Rdx;
+ };
+ break;
+ case Instruction::ICmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMaxReduce(Src, Flags.IsSigned);
+ };
+ } else {
+ MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin;
+ BuildFunc = [&]() {
+ return Builder.CreateIntMinReduce(Src, Flags.IsSigned);
+ };
+ }
+ break;
+ case Instruction::FCmp:
+ if (Flags.IsMaxOp) {
+ MinMaxKind = RD::MRK_FloatMax;
+ BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); };
+ } else {
+ MinMaxKind = RD::MRK_FloatMin;
+ BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); };
+ }
+ break;
+ default:
+ llvm_unreachable("Unhandled opcode");
+ break;
+ }
+ if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ return BuildFunc();
+ return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+}
+
+/// Create a vector reduction using a given recurrence descriptor.
+Value *llvm::createTargetReduction(IRBuilder<> &Builder,
+ const TargetTransformInfo *TTI,
+ RecurrenceDescriptor &Desc, Value *Src,
+ bool NoNaN) {
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+ TargetTransformInfo::ReductionFlags Flags;
+ Flags.NoNaN = NoNaN;
+ auto getSimpleRdx = [&](unsigned Opc) {
+ return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+ };
+ switch (RecKind) {
+ case RecurrenceDescriptor::RK_FloatAdd:
+ return getSimpleRdx(Instruction::FAdd);
+ case RecurrenceDescriptor::RK_FloatMult:
+ return getSimpleRdx(Instruction::FMul);
+ case RecurrenceDescriptor::RK_IntegerAdd:
+ return getSimpleRdx(Instruction::Add);
+ case RecurrenceDescriptor::RK_IntegerMult:
+ return getSimpleRdx(Instruction::Mul);
+ case RecurrenceDescriptor::RK_IntegerAnd:
+ return getSimpleRdx(Instruction::And);
+ case RecurrenceDescriptor::RK_IntegerOr:
+ return getSimpleRdx(Instruction::Or);
+ case RecurrenceDescriptor::RK_IntegerXor:
+ return getSimpleRdx(Instruction::Xor);
+ case RecurrenceDescriptor::RK_IntegerMinMax: {
+ switch (Desc.getMinMaxRecurrenceKind()) {
+ case RecurrenceDescriptor::MRK_SIntMax:
+ Flags.IsSigned = true;
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMax:
+ Flags.IsMaxOp = true;
+ break;
+ case RecurrenceDescriptor::MRK_SIntMin:
+ Flags.IsSigned = true;
+ break;
+ case RecurrenceDescriptor::MRK_UIntMin:
+ break;
+ default:
+ llvm_unreachable("Unhandled MRK");
+ }
+ return getSimpleRdx(Instruction::ICmp);
+ }
+ case RecurrenceDescriptor::RK_FloatMinMax: {
+ Flags.IsMaxOp =
+ Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+ return getSimpleRdx(Instruction::FCmp);
+ }
+ default:
+ llvm_unreachable("Unhandled RecKind");
+ }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+ if (auto *VecOp = dyn_cast<Instruction>(I)) {
+ if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+ // VecOp is initialized to the 0th scalar, so start counting from index
+ // '1'.
+ VecOp->copyIRFlags(I0);
+ for (int i = 1, e = VL.size(); i < e; ++i) {
+ if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+ VecOp->andIRFlags(Scalar);
+ }
+ }
+ }
+}
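The net effect of the refactoring above is that both vectorizers now funnel
through one entry point and the target picks the reduction form. A sketch of
the call as a vectorizer would make it (surrounding values are hypothetical):

    // createSimpleTargetReduction queries TTI->useReductionIntrinsic();
    // targets that opt in get the @llvm.experimental.vector.reduce.*
    // intrinsic, others fall back to the log2 shuffle sequence emitted by
    // getShuffleReduction.
    TargetTransformInfo::ReductionFlags Flags;
    Flags.NoNaN = HasNoNaNs;  // e.g. derived from the no-nans function attribute
    Value *Rdx = createSimpleTargetReduction(Builder, TTI, Instruction::FAdd,
                                             VecVal, Flags);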
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3fde0a45396..a9aa48395af 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1700,6 +1700,9 @@ public:
/// access that can be widened.
bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+ // Returns true if the NoNaN attribute is set on the function.
+ bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; }
+
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -4258,39 +4261,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
}
if (VF > 1) {
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = ReducedPartRdx;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
-
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- TmpVec = addFastMathFlag(Builder.CreateBinOp(
- (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
- else
- TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
- TmpVec, Shuf);
- }
-
- // The result is in the first element of the vector.
+ bool NoNaN = Legal->hasFunNoNaNAttr();
ReducedPartRdx =
- Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
+ createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
// If the reduction can be performed in a smaller type, we need to extend
// the reduction to the wider type before we branch to the original loop.
if (Phi->getType() != RdxDesc.getRecurrenceType())
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f112c555205..99084444bdd 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <memory>
@@ -212,23 +213,6 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
return Opcode;
}
-/// Get the intersection (logical and) of all of the potential IR flags
-/// of each scalar operation (VL) that will be converted into a vector (I).
-/// Flag set: NSW, NUW, exact, and all of fast-math.
-static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
- if (auto *VecOp = dyn_cast<Instruction>(I)) {
- if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
- // VecOVp is initialized to the 0th scalar, so start counting from index
- // '1'.
- VecOp->copyIRFlags(I0);
- for (int i = 1, e = VL.size(); i < e; ++i) {
- if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
- VecOp->andIRFlags(Scalar);
- }
- }
- }
-}
-
/// \returns true if all of the values in \p VL have the same type or false
/// otherwise.
static bool allSameType(ArrayRef<Value *> VL) {
@@ -4513,7 +4497,7 @@ public:
// Emit a reduction.
Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI);
if (VectorizedTree) {
Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
@@ -4583,33 +4567,31 @@ private:
/// \brief Emit a horizontal reduction of the vectorized value.
Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
+ unsigned ReduxWidth, ArrayRef<Value *> RedOps,
+ const TargetTransformInfo *TTI) {
assert(VectorizedValue && "Need to have a vectorized tree node");
assert(isPowerOf2_32(ReduxWidth) &&
"We only handle power-of-two reductions for now");
+ if (!IsPairwiseReduction)
+ return createSimpleTargetReduction(
+ Builder, TTI, ReductionOpcode, VectorizedValue,
+ TargetTransformInfo::ReductionFlags(), RedOps);
+
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- if (IsPairwiseReduction) {
- Value *LeftMask =
+ Value *LeftMask =
createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
+ Value *RightMask =
createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
- Value *LeftShuf = Builder.CreateShuffleVector(
+ Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
- Value *RightShuf = Builder.CreateShuffleVector(
+ Value *RightShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), (RightMask),
"rdx.shuf.r");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf,
- "bin.rdx");
- } else {
- Value *UpperHalf =
- createRdxShuffleMask(ReduxWidth, i, false, false, Builder);
- Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
- TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
- }
+ TmpVec =
+ Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx");
propagateIRFlags(TmpVec, RedOps);
}