1 files changed, 57 insertions, 54 deletions
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
index d663ae105f2..3cbce7caa76 100644
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -43,7 +43,7 @@ using namespace mlir::loop;
 using llvm::seq;
 
 // Extract an indexed value from KernelDim3.
-static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
+static ValuePtr getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
   switch (pos) {
   case 0:
     return dim3.x;
@@ -61,8 +61,8 @@ static Value *getDim3Value(const gpu::KernelDim3 &dim3, unsigned pos) {
 static Operation::operand_range getLowerBoundOperands(AffineForOp forOp) {
   return forOp.getLowerBoundOperands();
 }
-static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) {
-  SmallVector<Value *, 1> bounds(1, forOp.lowerBound());
+static SmallVector<ValuePtr, 1> getLowerBoundOperands(ForOp forOp) {
+  SmallVector<ValuePtr, 1> bounds(1, forOp.lowerBound());
   return bounds;
 }
 
@@ -70,33 +70,35 @@ static SmallVector<Value *, 1> getLowerBoundOperands(ForOp forOp) {
 static Operation::operand_range getUpperBoundOperands(AffineForOp forOp) {
   return forOp.getUpperBoundOperands();
 }
-static SmallVector<Value *, 1> getUpperBoundOperands(ForOp forOp) {
-  SmallVector<Value *, 1> bounds(1, forOp.upperBound());
+static SmallVector<ValuePtr, 1> getUpperBoundOperands(ForOp forOp) {
+  SmallVector<ValuePtr, 1> bounds(1, forOp.upperBound());
   return bounds;
 }
 
 // Get a Value that corresponds to the loop step.  If the step is an attribute,
 // materialize a corresponding constant using builder.
-static Value *getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
+static ValuePtr getOrCreateStep(AffineForOp forOp, OpBuilder &builder) {
   return builder.create<ConstantIndexOp>(forOp.getLoc(), forOp.getStep());
 }
-static Value *getOrCreateStep(ForOp forOp, OpBuilder &) { return forOp.step(); }
+static ValuePtr getOrCreateStep(ForOp forOp, OpBuilder &) {
+  return forOp.step();
+}
 
 // Get a Value for the loop lower bound.  If the value requires computation,
 // materialize the instructions using builder.
-static Value *getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
+static ValuePtr getOrEmitLowerBound(AffineForOp forOp, OpBuilder &builder) {
   return lowerAffineLowerBound(forOp, builder);
 }
-static Value *getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
+static ValuePtr getOrEmitLowerBound(ForOp forOp, OpBuilder &) {
   return forOp.lowerBound();
 }
 
 // Get a Value for the loop upper bound.  If the value requires computation,
 // materialize the instructions using builder.
-static Value *getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
+static ValuePtr getOrEmitUpperBound(AffineForOp forOp, OpBuilder &builder) {
   return lowerAffineUpperBound(forOp, builder);
 }
-static Value *getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
+static ValuePtr getOrEmitUpperBound(ForOp forOp, OpBuilder &) {
   return forOp.upperBound();
 }
 
@@ -212,18 +214,18 @@ struct LoopToGpuConverter {
                     unsigned numThreadDims);
 
   // Ranges of the loops mapped to blocks or threads.
-  SmallVector<Value *, 6> dims;
+  SmallVector<ValuePtr, 6> dims;
   // Lower bounds of the loops mapped to blocks or threads.
-  SmallVector<Value *, 6> lbs;
+  SmallVector<ValuePtr, 6> lbs;
   // Induction variables of the loops mapped to blocks or threads.
-  SmallVector<Value *, 6> ivs;
+  SmallVector<ValuePtr, 6> ivs;
   // Steps of the loops mapped to blocks or threads.
-  SmallVector<Value *, 6> steps;
+  SmallVector<ValuePtr, 6> steps;
 };
 } // namespace
 
 // Return true if the value is obviously a constant "one".
-static bool isConstantOne(Value *value) {
+static bool isConstantOne(ValuePtr value) {
   if (auto def = dyn_cast_or_null<ConstantIndexOp>(value->getDefiningOp()))
     return def.getValue() == 1;
   return false;
@@ -244,15 +246,15 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
   steps.reserve(numLoops);
   OpTy currentLoop = forOp;
   for (unsigned i = 0; i < numLoops; ++i) {
-    Value *lowerBound = getOrEmitLowerBound(currentLoop, builder);
-    Value *upperBound = getOrEmitUpperBound(currentLoop, builder);
+    ValuePtr lowerBound = getOrEmitLowerBound(currentLoop, builder);
+    ValuePtr upperBound = getOrEmitUpperBound(currentLoop, builder);
     if (!lowerBound || !upperBound) {
       return llvm::None;
     }
 
-    Value *range =
+    ValuePtr range =
         builder.create<SubIOp>(currentLoop.getLoc(), upperBound, lowerBound);
-    Value *step = getOrCreateStep(currentLoop, builder);
+    ValuePtr step = getOrCreateStep(currentLoop, builder);
     if (!isConstantOne(step))
       range = builder.create<SignedDivIOp>(currentLoop.getLoc(), range, step);
     dims.push_back(range);
@@ -274,8 +276,8 @@ Optional<OpTy> LoopToGpuConverter::collectBounds(OpTy forOp,
 /// `nids`. The innermost loop is mapped to the x-dimension, followed by the
 /// next innermost loop to y-dimension, followed by z-dimension.
 template <typename OpTy>
-OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value *> ids,
-                          ArrayRef<Value *> nids) {
+OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<ValuePtr> ids,
+                          ArrayRef<ValuePtr> nids) {
   auto nDims = ids.size();
   assert(nDims == nids.size());
   for (auto dim : llvm::seq<unsigned>(0, nDims)) {
@@ -295,11 +297,11 @@ OpTy createGPULaunchLoops(OpTy rootForOp, ArrayRef<Value *> ids,
 /// each workgroup/workitem and number of workgroup/workitems along a dimension
 /// of the launch into a container.
 void packIdAndNumId(gpu::KernelDim3 kernelIds, gpu::KernelDim3 kernelNids,
-                    unsigned nDims, SmallVectorImpl<Value *> &ids,
-                    SmallVectorImpl<Value *> &nids) {
+                    unsigned nDims, SmallVectorImpl<ValuePtr> &ids,
+                    SmallVectorImpl<ValuePtr> &nids) {
   assert(nDims <= 3 && "invalid number of launch dimensions");
-  SmallVector<Value *, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
-  SmallVector<Value *, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
+  SmallVector<ValuePtr, 3> allIds = {kernelIds.z, kernelIds.y, kernelIds.x};
+  SmallVector<ValuePtr, 3> allNids = {kernelNids.z, kernelNids.y, kernelNids.x};
   ids.clear();
   ids.append(std::next(allIds.begin(), allIds.size() - nDims), allIds.end());
   nids.clear();
@@ -317,7 +319,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
   auto returnOp = builder.create<gpu::ReturnOp>(launchOp.getLoc());
 
   rootForOp.getOperation()->moveBefore(returnOp);
-  SmallVector<Value *, 3> workgroupID, numWorkGroups;
+  SmallVector<ValuePtr, 3> workgroupID, numWorkGroups;
   packIdAndNumId(launchOp.getBlockIds(), launchOp.getGridSize(), numBlockDims,
                  workgroupID, numWorkGroups);
 
@@ -333,7 +335,7 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
     }
   }
 
-  SmallVector<Value *, 3> workItemID, workGroupSize;
+  SmallVector<ValuePtr, 3> workItemID, workGroupSize;
   packIdAndNumId(launchOp.getThreadIds(), launchOp.getBlockSize(),
                  numThreadDims, workItemID, workGroupSize);
   for (auto &loopOp : threadRootForOps) {
@@ -347,17 +349,17 @@ LogicalResult createLaunchBody(OpBuilder &builder, OpTy rootForOp,
 // given workgroup size and number of workgroups.
 template <typename OpTy>
 LogicalResult createLaunchFromOp(OpTy rootForOp,
-                                 ArrayRef<Value *> numWorkGroups,
-                                 ArrayRef<Value *> workGroupSizes) {
+                                 ArrayRef<ValuePtr> numWorkGroups,
+                                 ArrayRef<ValuePtr> workGroupSizes) {
   OpBuilder builder(rootForOp.getOperation());
   if (numWorkGroups.size() > 3) {
     return rootForOp.emitError("invalid ")
            << numWorkGroups.size() << "-D workgroup specification";
   }
   auto loc = rootForOp.getLoc();
-  Value *one = builder.create<ConstantOp>(
+  ValuePtr one = builder.create<ConstantOp>(
       loc, builder.getIntegerAttr(builder.getIndexType(), 1));
-  SmallVector<Value *, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
+  SmallVector<ValuePtr, 3> numWorkGroups3D(3, one), workGroupSize3D(3, one);
   for (auto numWorkGroup : enumerate(numWorkGroups)) {
     numWorkGroups3D[numWorkGroup.index()] = numWorkGroup.value();
   }
@@ -367,7 +369,7 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
 
   // Get the values used within the region of the rootForOp but defined above
   // it.
-  llvm::SetVector<Value *> valuesToForwardSet;
+  llvm::SetVector<ValuePtr> valuesToForwardSet;
   getUsedValuesDefinedAbove(rootForOp.region(), rootForOp.region(),
                             valuesToForwardSet);
   // Also add the values used for the lb, ub, and step of the rootForOp.
@@ -387,8 +389,8 @@ LogicalResult createLaunchFromOp(OpTy rootForOp,
   // defined outside. They all are replaced with kernel arguments.
   for (const auto &pair :
        llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
-    Value *from = std::get<0>(pair);
-    Value *to = std::get<1>(pair);
+    ValuePtr from = std::get<0>(pair);
+    ValuePtr to = std::get<1>(pair);
     replaceAllUsesInRegionWith(from, to, launchOp.body());
   }
   return success();
@@ -408,22 +410,23 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
   OpBuilder builder(rootForOp.getOperation());
   // Prepare the grid and block sizes for the launch operation.  If there is
   // no loop mapped to a specific dimension, use constant "1" as its size.
-  Value *constOne = (numBlockDims < 3 || numThreadDims < 3)
-                        ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
-                        : nullptr;
-  Value *gridSizeX = dims[0];
-  Value *gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
-  Value *gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
-  Value *blockSizeX = dims[numBlockDims];
-  Value *blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
-  Value *blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
+  ValuePtr constOne =
+      (numBlockDims < 3 || numThreadDims < 3)
+          ? builder.create<ConstantIndexOp>(rootForOp.getLoc(), 1)
+          : nullptr;
+  ValuePtr gridSizeX = dims[0];
+  ValuePtr gridSizeY = numBlockDims > 1 ? dims[1] : constOne;
+  ValuePtr gridSizeZ = numBlockDims > 2 ? dims[2] : constOne;
+  ValuePtr blockSizeX = dims[numBlockDims];
+  ValuePtr blockSizeY = numThreadDims > 1 ? dims[numBlockDims + 1] : constOne;
+  ValuePtr blockSizeZ = numThreadDims > 2 ? dims[numBlockDims + 2] : constOne;
 
   // Create a launch op and move the body region of the innermost loop to the
   // launch op.  Pass the values defined outside the outermost loop and used
   // inside the innermost loop and loop lower bounds as kernel data arguments.
   // Still assuming perfect nesting so there are no values other than induction
   // variables that are defined in one loop and used in deeper loops.
-  llvm::SetVector<Value *> valuesToForwardSet;
+  llvm::SetVector<ValuePtr> valuesToForwardSet;
   getUsedValuesDefinedAbove(innermostForOp.region(), rootForOp.region(),
                             valuesToForwardSet);
   auto valuesToForward = valuesToForwardSet.takeVector();
@@ -457,15 +460,15 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
                                 originallyForwardedValues);
   auto stepArgumentIt = std::next(lbArgumentIt, lbs.size());
   for (auto en : llvm::enumerate(ivs)) {
-    Value *id =
+    ValuePtr id =
         en.index() < numBlockDims
             ? getDim3Value(launchOp.getBlockIds(), en.index())
             : getDim3Value(launchOp.getThreadIds(), en.index() - numBlockDims);
-    Value *step = steps[en.index()];
+    ValuePtr step = steps[en.index()];
     if (!isConstantOne(step))
       id = builder.create<MulIOp>(rootForOp.getLoc(), step, id);
 
-    Value *ivReplacement =
+    ValuePtr ivReplacement =
         builder.create<AddIOp>(rootForOp.getLoc(), *lbArgumentIt, id);
     en.value()->replaceAllUsesWith(ivReplacement);
     replaceAllUsesInRegionWith(steps[en.index()], *stepArgumentIt,
@@ -479,8 +482,8 @@ void LoopToGpuConverter::createLaunch(OpTy rootForOp, OpTy innermostForOp,
   // trailing positions, make sure we don't touch those.
   for (const auto &pair :
        llvm::zip_first(valuesToForward, launchOp.getKernelArguments())) {
-    Value *from = std::get<0>(pair);
-    Value *to = std::get<1>(pair);
+    ValuePtr from = std::get<0>(pair);
+    ValuePtr to = std::get<1>(pair);
     replaceAllUsesInRegionWith(from, to, launchOp.body());
   }
 
@@ -510,8 +513,8 @@ static LogicalResult convertLoopNestToGPULaunch(OpTy forOp,
 // nested. The workgroup size and num workgroups is provided as input
 template <typename OpTy>
 static LogicalResult convertLoopToGPULaunch(OpTy forOp,
-                                            ArrayRef<Value *> numWorkGroups,
-                                            ArrayRef<Value *> workGroupSize) {
+                                            ArrayRef<ValuePtr> numWorkGroups,
+                                            ArrayRef<ValuePtr> workGroupSize) {
   if (failed(checkLoopOpMappable(forOp, numWorkGroups.size(),
                                  workGroupSize.size()))) {
     return failure();
@@ -532,7 +535,7 @@ LogicalResult mlir::convertLoopNestToGPULaunch(ForOp forOp,
 }
 
 LogicalResult mlir::convertLoopToGPULaunch(loop::ForOp forOp,
-                                           ArrayRef<Value *> numWorkGroups,
-                                           ArrayRef<Value *> workGroupSizes) {
+                                           ArrayRef<ValuePtr> numWorkGroups,
+                                           ArrayRef<ValuePtr> workGroupSizes) {
   return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
 }