diff options
| -rw-r--r-- | llvm/include/llvm/Transforms/Utils/LoopUtils.h | 7 | ||||
| -rw-r--r-- | llvm/lib/CodeGen/ExpandReductions.cpp | 15 | ||||
| -rw-r--r-- | llvm/lib/Transforms/Utils/LoopUtils.cpp | 32 | ||||
| -rw-r--r-- | llvm/test/CodeGen/Generic/expand-experimental-reductions.ll | 44 | 
4 files changed, 84 insertions, 14 deletions
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index 6e854ed24a7..3a0e804e0cc 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -509,6 +509,13 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,                          LoopSafetyInfo *SafetyInfo,                          OptimizationRemarkEmitter *ORE = nullptr); +/// Generates an ordered vector reduction using extracts to reduce the value. +Value * +getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, unsigned Op, +                    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind = +                        RecurrenceDescriptor::MRK_Invalid, +                    ArrayRef<Value *> RedOps = None); +  /// Generates a vector reduction using shufflevectors to reduce the value.  Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,                             RecurrenceDescriptor::MinMaxRecurrenceKind diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp index abf487a4f19..7552ba8cd85 100644 --- a/llvm/lib/CodeGen/ExpandReductions.cpp +++ b/llvm/lib/CodeGen/ExpandReductions.cpp @@ -78,13 +78,15 @@ RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {  bool expandReductions(Function &F, const TargetTransformInfo *TTI) {    bool Changed = false; -  SmallVector<IntrinsicInst*, 4> Worklist; +  SmallVector<IntrinsicInst *, 4> Worklist;    for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)      if (auto II = dyn_cast<IntrinsicInst>(&*I))        Worklist.push_back(II);    for (auto *II : Worklist) {      IRBuilder<> Builder(II); +    bool IsOrdered = false; +    Value *Acc = nullptr;      Value *Vec = nullptr;      auto ID = II->getIntrinsicID();      auto MRK = RecurrenceDescriptor::MRK_Invalid; @@ -92,11 +94,10 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {      case Intrinsic::experimental_vector_reduce_fadd:      case Intrinsic::experimental_vector_reduce_fmul:        // FMFs must be attached to the call, otherwise it's an ordered reduction -      // and it can't be handled by generating this shuffle sequence. -      // TODO: Implement scalarization of ordered reductions here for targets -      // without native support. +      // and it can't be handled by generating a shuffle sequence.        if (!II->getFastMathFlags().isFast()) -        continue; +        IsOrdered = true; +      Acc = II->getArgOperand(0);        Vec = II->getArgOperand(1);        break;      case Intrinsic::experimental_vector_reduce_add: @@ -118,7 +119,9 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {      }      if (!TTI->shouldExpandReduction(II))        continue; -    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK); +    Value *Rdx = +        IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK) +                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);      II->replaceAllUsesWith(Rdx);      II->eraseFromParent();      Changed = true; diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 67e209583b7..805a003f18f 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -1526,6 +1526,38 @@ static Value *addFastMathFlag(Value *V) {    return V;  } +// Helper to generate an ordered reduction. +Value * +llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src, +                          unsigned Op, +                          RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, +                          ArrayRef<Value *> RedOps) { +  unsigned VF = Src->getType()->getVectorNumElements(); + +  // Extract and apply reduction ops in ascending order: +  // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1] +  Value *Result = Acc; +  for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) { +    Value *Ext = +        Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx)); + +    if (Op != Instruction::ICmp && Op != Instruction::FCmp) { +      Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext, +                                   "bin.rdx"); +    } else { +      assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && +             "Invalid min/max"); +      Result = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, Result, +                                                    Ext); +    } + +    if (!RedOps.empty()) +      propagateIRFlags(Result, RedOps); +  } + +  return Result; +} +  // Helper to generate a log2 shuffle reduction.  Value *  llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, diff --git a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll index 472e66ce1dd..05fa6e324ac 100644 --- a/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll +++ b/llvm/test/CodeGen/Generic/expand-experimental-reductions.ll @@ -117,8 +117,15 @@ entry:  define float @fadd_f32_strict(<4 x float> %vec) {  ; CHECK-LABEL: @fadd_f32_strict(  ; CHECK-NEXT:  entry: -; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]]) -; CHECK-NEXT:    ret float [[R]] +; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0 +; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float undef, [[TMP0]] +; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1 +; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]] +; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2 +; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]] +; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3 +; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]] +; CHECK-NEXT:    ret float [[BIN_RDX3]]  ;  entry:    %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec) @@ -128,8 +135,15 @@ entry:  define float @fadd_f32_strict_accum(float %accum, <4 x float> %vec) {  ; CHECK-LABEL: @fadd_f32_strict_accum(  ; CHECK-NEXT:  entry: -; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]]) -; CHECK-NEXT:    ret float [[R]] +; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0 +; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1 +; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fadd float [[BIN_RDX]], [[TMP1]] +; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2 +; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd float [[BIN_RDX1]], [[TMP2]] +; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3 +; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fadd float [[BIN_RDX2]], [[TMP3]] +; CHECK-NEXT:    ret float [[BIN_RDX3]]  ;  entry:    %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %accum, <4 x float> %vec) @@ -169,8 +183,15 @@ entry:  define float @fmul_f32_strict(<4 x float> %vec) {  ; CHECK-LABEL: @fmul_f32_strict(  ; CHECK-NEXT:  entry: -; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]]) -; CHECK-NEXT:    ret float [[R]] +; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0 +; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float undef, [[TMP0]] +; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1 +; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]] +; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2 +; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]] +; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3 +; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]] +; CHECK-NEXT:    ret float [[BIN_RDX3]]  ;  entry:    %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec) @@ -180,8 +201,15 @@ entry:  define float @fmul_f32_strict_accum(float %accum, <4 x float> %vec) {  ; CHECK-LABEL: @fmul_f32_strict_accum(  ; CHECK-NEXT:  entry: -; CHECK-NEXT:    [[R:%.*]] = call float @llvm.experimental.vector.reduce.fmul.f32.f32.v4f32(float [[ACCUM:%.*]], <4 x float> [[VEC:%.*]]) -; CHECK-NEXT:    ret float [[R]] +; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x float> [[VEC:%.*]], i32 0 +; CHECK-NEXT:    [[BIN_RDX:%.*]] = fmul float [[ACCUM:%.*]], [[TMP0]] +; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[VEC]], i32 1 +; CHECK-NEXT:    [[BIN_RDX1:%.*]] = fmul float [[BIN_RDX]], [[TMP1]] +; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[VEC]], i32 2 +; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fmul float [[BIN_RDX1]], [[TMP2]] +; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[VEC]], i32 3 +; CHECK-NEXT:    [[BIN_RDX3:%.*]] = fmul float [[BIN_RDX2]], [[TMP3]] +; CHECK-NEXT:    ret float [[BIN_RDX3]]  ;  entry:    %r = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %accum, <4 x float> %vec)  | 

