Introduce experimental generic intrinsics for horizontal vector reductions.

- This change allows targets to opt-in to using them instead of the log2 shufflevector algorithm. - The SLP and Loop vectorizers have the common code to do shuffle reductions factored out into LoopUtils, and now have a unified interface for generating reductions regardless of the preference of the target. LoopUtils now uses TTI to determine what kind of reductions the target wants to handle. - For CodeGen, basic legalization support is added. Differential Revision: https://reviews.llvm.org/D30086 llvm-svn: 302514
author: Amara Emerson <amara.emerson@arm.com> 2017-05-09 10:43:25 +0000
committer: Amara Emerson <amara.emerson@arm.com> 2017-05-09 10:43:25 +0000
commit: cf9daa33a7870c235e0edc176dd40579f376cafc (patch)
tree: 4df699a6f02c81cbbc2c7c4639c299a0dea5632c /llvm/include
parent: b7bf386e8098aed73f0b9b2df40067afc07dffab (diff)
download: bcm5719-llvm-cf9daa33a7870c235e0edc176dd40579f376cafc.tar.gz
bcm5719-llvm-cf9daa33a7870c235e0edc176dd40579f376cafc.zip
6 files changed, 148 insertions, 0 deletions
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index b9639dba188..a769d5f67dd 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -740,6 +740,19 @@ public:
                                 unsigned ChainSizeInBytes,
                                 VectorType *VecTy) const;
 
+  /// Flags describing the kind of vector reduction.
+  struct ReductionFlags {
+    ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {}
+    bool IsMaxOp;  ///< If the op a min/max kind, true if it's a max operation.
+    bool IsSigned; ///< Whether the operation is a signed int reduction.
+    bool NoNaN;    ///< If op is an fp min/max, whether NaNs may be present.
+  };
+
+  /// \returns True if the target wants to handle the given reduction idiom in
+  /// the intrinsics form instead of the shuffle form.
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             ReductionFlags Flags) const;
+
   /// @}
 
 private:
@@ -895,6 +908,8 @@ public:
   virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                         unsigned ChainSizeInBytes,
                                         VectorType *VecTy) const = 0;
+  virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                                     ReductionFlags) const = 0;
 };
 
 template <typename T>
@@ -1200,6 +1215,10 @@ public:
                                 VectorType *VecTy) const override {
     return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
   }
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             ReductionFlags Flags) const override {
+    return Impl.useReductionIntrinsic(Opcode, Ty, Flags);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index d7fda9e14b0..83b975e94f6 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -456,6 +456,12 @@ public:
                                 VectorType *VecTy) const {
     return VF;
   }
+
+  bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+                             TTI::ReductionFlags Flags) const {
+    return false;
+  }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ca0f3fbad89..2bc218f0aec 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -783,6 +783,20 @@ namespace ISD {
     /// known nonzero constant. The only operand here is the chain.
     GET_DYNAMIC_AREA_OFFSET,
 
+    /// Generic reduction nodes. These nodes represent horizontal vector
+    /// reduction operations, producing a scalar result.
+    /// The STRICT variants perform reductions in sequential order. The first
+    /// operand is an initial scalar accumulator value, and the second operand
+    /// is the vector to reduce.
+    VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL,
+    /// These reductions are non-strict, and have a single vector operand.
+    VECREDUCE_FADD, VECREDUCE_FMUL,
+    VECREDUCE_ADD, VECREDUCE_MUL,
+    VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR,
+    VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN,
+    /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
+    VECREDUCE_FMAX, VECREDUCE_FMIN,
+
     /// BUILTIN_OP_END - This must be the last enum value in this list.
     /// The target-specific pre-isel opcode values start here.
     BUILTIN_OP_END
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
index bc689f3b01d..9d4c13c29f6 100644
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -454,6 +454,45 @@ public:
                           MDNode *ScopeTag = nullptr,
                           MDNode *NoAliasTag = nullptr);
 
+  /// \brief Create a vector fadd reduction intrinsic of the source vector.
+  /// The first parameter is a scalar accumulator value for ordered reductions.
+  CallInst *CreateFAddReduce(Value *Acc, Value *Src);
+
+  /// \brief Create a vector fmul reduction intrinsic of the source vector.
+  /// The first parameter is a scalar accumulator value for ordered reductions.
+  CallInst *CreateFMulReduce(Value *Acc, Value *Src);
+
+  /// \brief Create a vector int add reduction intrinsic of the source vector.
+  CallInst *CreateAddReduce(Value *Src);
+
+  /// \brief Create a vector int mul reduction intrinsic of the source vector.
+  CallInst *CreateMulReduce(Value *Src);
+
+  /// \brief Create a vector int AND reduction intrinsic of the source vector.
+  CallInst *CreateAndReduce(Value *Src);
+
+  /// \brief Create a vector int OR reduction intrinsic of the source vector.
+  CallInst *CreateOrReduce(Value *Src);
+
+  /// \brief Create a vector int XOR reduction intrinsic of the source vector.
+  CallInst *CreateXorReduce(Value *Src);
+
+  /// \brief Create a vector integer max reduction intrinsic of the source
+  /// vector.
+  CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false);
+
+  /// \brief Create a vector integer min reduction intrinsic of the source
+  /// vector.
+  CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false);
+
+  /// \brief Create a vector float max reduction intrinsic of the source
+  /// vector.
+  CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false);
+
+  /// \brief Create a vector float min reduction intrinsic of the source
+  /// vector.
+  CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false);
+
   /// \brief Create a lifetime.start intrinsic.
   ///
   /// If the pointer isn't i8* it will be converted.
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 7b78d4d3d34..19f6045568f 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -812,6 +812,50 @@ def int_memcpy_element_atomic  : Intrinsic<[],
                                  [IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
                                   WriteOnly<0>, ReadOnly<1>]>;
 
+//===------------------------ Reduction Intrinsics ------------------------===//
+//
+def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
+                                                    [llvm_anyfloat_ty,
+                                                     llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty],
+                                                    [llvm_anyfloat_ty,
+                                                     llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty],
+                                                   [llvm_anyvector_ty],
+                                                   [IntrNoMem]>;
+def int_experimental_vector_reduce_mul : Intrinsic<[llvm_anyint_ty],
+                                                   [llvm_anyvector_ty],
+                                                   [IntrNoMem]>;
+def int_experimental_vector_reduce_and : Intrinsic<[llvm_anyint_ty],
+                                                   [llvm_anyvector_ty],
+                                                   [IntrNoMem]>;
+def int_experimental_vector_reduce_or : Intrinsic<[llvm_anyint_ty],
+                                                  [llvm_anyvector_ty],
+                                                  [IntrNoMem]>;
+def int_experimental_vector_reduce_xor : Intrinsic<[llvm_anyint_ty],
+                                                   [llvm_anyvector_ty],
+                                                   [IntrNoMem]>;
+def int_experimental_vector_reduce_smax : Intrinsic<[llvm_anyint_ty],
+                                                    [llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_smin : Intrinsic<[llvm_anyint_ty],
+                                                    [llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_umax : Intrinsic<[llvm_anyint_ty],
+                                                    [llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_umin : Intrinsic<[llvm_anyint_ty],
+                                                    [llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_fmax : Intrinsic<[llvm_anyfloat_ty],
+                                                    [llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+def int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty],
+                                                    [llvm_anyvector_ty],
+                                                    [IntrNoMem]>;
+
 //===----- Intrinsics that are used to provide predicate information -----===//
 
 def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index a1cf41d6f93..94d10c98eb0 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -21,6 +21,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
@@ -42,6 +43,7 @@ class PredIteratorCache;
 class ScalarEvolution;
 class SCEV;
 class TargetLibraryInfo;
+class TargetTransformInfo;
 
 /// \brief Captures loop safety information.
 /// It keep information for loop & its header may throw exception.
@@ -489,6 +491,30 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                         LoopSafetyInfo *SafetyInfo,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Create a target reduction of the given vector. The reduction operation
+/// is described by the \p Opcode parameter. min/max reductions require
+/// additional information supplied in \p Flags.
+/// The target is queried to determine if intrinsics or shuffle sequences are
+/// required to implement the reduction.
+Value *
+createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
+                            unsigned Opcode, Value *Src,
+                            TargetTransformInfo::ReductionFlags Flags =
+                                TargetTransformInfo::ReductionFlags(),
+                            ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+
+/// Create a generic target reduction using a recurrence descriptor \p Desc
+/// The target is queried to determine if intrinsics or shuffle sequences are
+/// required to implement the reduction.
+Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
+                             RecurrenceDescriptor &Desc, Value *Src,
+                             bool NoNaN = false);
+
+/// Get the intersection (logical and) of all of the potential IR flags
+/// of each scalar operation (VL) that will be converted into a vector (I).
+/// Flag set: NSW, NUW, exact, and all of fast-math.
+void propagateIRFlags(Value *I, ArrayRef<Value *> VL);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H
author	Amara Emerson <amara.emerson@arm.com>	2017-05-09 10:43:25 +0000
committer	Amara Emerson <amara.emerson@arm.com>	2017-05-09 10:43:25 +0000
commit	cf9daa33a7870c235e0edc176dd40579f376cafc (patch)
tree	4df699a6f02c81cbbc2c7c4639c299a0dea5632c /llvm/include
parent	b7bf386e8098aed73f0b9b2df40067afc07dffab (diff)
download	bcm5719-llvm-cf9daa33a7870c235e0edc176dd40579f376cafc.tar.gz bcm5719-llvm-cf9daa33a7870c235e0edc176dd40579f376cafc.zip