summary refs log tree commit diff stats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h        | 12
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg  |  2
-rw-r--r--  llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll       | 40
3 files changed, 54 insertions, 0 deletions
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index d2414b72a00..812d305da18 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -49,6 +49,18 @@ public:
return AddressSpace::ADDRESS_SPACE_GENERIC;
}
+ // NVPTX has infinite registers of all kinds, but the actual machine doesn't.
+ // We conservatively return 1 here which is just enough to enable the
+ // vectorizers but disables heuristics based on the number of registers.
+ // FIXME: Return a more reasonable number, while keeping an eye on
+ // LoopVectorizer's unrolling heuristics.
+ unsigned getNumberOfRegisters(bool Vector) const { return 1; }
+
+ // Only <2 x half> should be vectorized, so always return 32 for the vector
+ // register size.
+ unsigned getRegisterBitWidth(bool Vector) const { return 32; }
+ unsigned getMinVectorRegisterBitWidth() const { return 32; }
+
// Increase the inlining cost threshold by a factor of 5, reflecting that
// calls are particularly expensive in NVPTX.
unsigned getInliningThresholdMultiplier() { return 5; }
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
new file mode 100644
index 00000000000..2cb98eb371b
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+ config.unsupported = True
diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
new file mode 100644
index 00000000000..d8b80f437b9
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s
+; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR
+
+; CHECK-LABEL: @fusion
+; CHECK: load <2 x half>, <2 x half>*
+; CHECK: fmul fast <2 x half>
+; CHECK: fadd fast <2 x half>
+; CHECK: store <2 x half> %4, <2 x half>
+
+; NOVECTOR-LABEL: @fusion
+; NOVECTOR: load half
+; NOVECTOR: fmul fast half
+; NOVECTOR: fadd fast half
+; NOVECTOR: fmul fast half
+; NOVECTOR: fadd fast half
+; NOVECTOR: store half
+define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 {
+ %tmp = shl nuw nsw i32 %arg2, 6
+ %tmp4 = or i32 %tmp, %arg3
+ %tmp5 = shl nuw nsw i32 %tmp4, 2
+ %tmp6 = zext i32 %tmp5 to i64
+ %tmp7 = or i64 %tmp6, 1
+ %tmp10 = bitcast i8* %arg1 to half*
+ %tmp11 = getelementptr inbounds half, half* %tmp10, i64 %tmp6
+ %tmp12 = load half, half* %tmp11, align 8
+ %tmp13 = fmul fast half %tmp12, 0xH5380
+ %tmp14 = fadd fast half %tmp13, 0xH57F0
+ %tmp15 = bitcast i8* %arg to half*
+ %tmp16 = getelementptr inbounds half, half* %tmp15, i64 %tmp6
+ store half %tmp14, half* %tmp16, align 8
+ %tmp17 = getelementptr inbounds half, half* %tmp10, i64 %tmp7
+ %tmp18 = load half, half* %tmp17, align 2
+ %tmp19 = fmul fast half %tmp18, 0xH5380
+ %tmp20 = fadd fast half %tmp19, 0xH57F0
+ %tmp21 = getelementptr inbounds half, half* %tmp15, i64 %tmp7
+ store half %tmp20, half* %tmp21, align 2
+ ret void
+}
+
+attributes #0 = { nounwind }
OpenPOWER on IntegriCloud