summary refs log tree commit diff stats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h        | 12
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg  |  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll       | 40
3 files changed, 54 insertions, 0 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d2414b72a00..812d305da18 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -49,6 +49,18 @@ public:
return AddressSpace::ADDRESS_SPACE_GENERIC;
}
+ // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
+ // We conservatively return 1 here which is just enough to enable the
+ // vectorizers but disables heuristics based on the number of registers.
+ // FIXME: Return a more reasonable number, while keeping an eye on
+ // LoopVectorizer's unrolling heuristics.
+ unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+
+ // Only <2 x half> should be vectorized, so always return 32 for the vector
+ // register size.
+ unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+ unsigned getMinVectorRegisterBitWidth() const { return 32; }
+
// Increase the inlining cost threshold by a factor of 5, reflecting that
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
new file mode 100644
index 00000000000..2cb98eb371b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
new file mode 100644
index 00000000000..d8b80f437b9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR
+
+; CHECK-LABEL: @fusion
+; CHECK: load <2 x half>, <2 x half>*
+; CHECK: fmul fast <2 x half>
+; CHECK: fadd fast <2 x half>
+; CHECK: store <2 x half> %4, <2 x half>
+
+; NOVECTOR-LABEL: @fusion
+; NOVECTOR: load half
+; NOVECTOR: fmul fast half
+; NOVECTOR: fadd fast half
+; NOVECTOR: fmul fast half
+; NOVECTOR: fadd fast half
+; NOVECTOR: store half
+define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
+ %tmp = shl nuw nsw i32 %arg2, 6
+ %tmp4 = or i32 %tmp, %arg3
+ %tmp5 = shl nuw nsw i32 %tmp4, 2
+ %tmp6 = zext i32 %tmp5 to i64
+ %tmp7 = or i64 %tmp6, 1
+ %tmp10 = bitcast i8* %arg1 to half*
+ %tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
+ %tmp12 = load half, half* %tmp11, align 8
+ %tmp13 = fmul fast half %tmp12, 0xH5380
+ %tmp14 = fadd fast half %tmp13, 0xH57F0
+ %tmp15 = bitcast i8* %arg to half*
+ %tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
+ store half %tmp14, half* %tmp16, align 8
+ %tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
+ %tmp18 = load half, half* %tmp17, align 2
+ %tmp19 = fmul fast half %tmp18, 0xH5380
+ %tmp20 = fadd fast half %tmp19, 0xH57F0
+ %tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
+ store half %tmp20, half* %tmp21, align 2
+ ret void
+}
+
+attributes #0 = { nounwind }
OpenPOWER on IntegriCloud