7 files changed, 155 insertions, 35 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
index 65ab3e04237..7efb1850a27 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX906
 
-declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c)
+declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot2_f32_f16 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2
-; GFX906: v_dot2_f32_f16
-define amdgpu_kernel void @test_llvm_amdgcn_fdot2(
+; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp
+; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp(
     float addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
   %c.val = load float, float addrspace(1)* %c
-  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val)
+  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 1)
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GFX906-LABEL: {{^}}test_llvm_amdgcn_fdot2_no_clamp
+; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    float addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a,
+    <2 x half> addrspace(1)* %b,
+    float addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
+  %c.val = load float, float addrspace(1)* %c
+  %r.val = call float @llvm.amdgcn.fdot2(<2 x half> %a.val, <2 x half> %b.val, float %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store float %r.val, float addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
index 0d8f28bbef1..f1894cc14cc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot2.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot2_i32_i16 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2
-; GFX906: v_dot2_i32_i16
-define amdgpu_kernel void @test_llvm_amdgcn_sdot2(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_clamp
+; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot2_clamp(
     i32 addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a,
     <2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
   %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
   %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot2_no_clamp
+; GFX906: v_dot2_i32_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot2_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    i32 addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a,
+    <2 x i16> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
index 8b664e6f9a4..2651200a344 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot4.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot4_i32_i8 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4
-; GFX906: v_dot4_i32_i8
-define amdgpu_kernel void @test_llvm_amdgcn_sdot4(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_clamp
+; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot4_clamp(
     i32 addrspace(1)* %r,
     <4 x i8> addrspace(1)* %a,
     <4 x i8> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
   %a.val.cast = bitcast <4 x i8> %a.val to i32
   %b.val.cast = bitcast <4 x i8> %b.val to i32
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot4_no_clamp
+; GFX906: v_dot4_i32_i8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot4_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    i32 addrspace(1)* %r,
+    <4 x i8> addrspace(1)* %a,
+    <4 x i8> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
+  %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+  %a.val.cast = bitcast <4 x i8> %a.val to i32 ; the intrinsic is declared with i32 inputs, so each <4 x i8> vector is reinterpreted as one i32
+  %b.val.cast = bitcast <4 x i8> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
index e2466eae539..456421c4984 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sdot8.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot8_i32_i4 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8
-; GFX906: v_dot8_i32_i4
-define amdgpu_kernel void @test_llvm_amdgcn_sdot8(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp
+; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
     i32 addrspace(1)* %r,
     <8 x i4> addrspace(1)* %a,
     <8 x i4> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
   %a.val.cast = bitcast <8 x i4> %a.val to i32
   %b.val.cast = bitcast <8 x i4> %b.val to i32
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp
+; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    i32 addrspace(1)* %r,
+    <8 x i4> addrspace(1)* %a,
+    <8 x i4> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
+  %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+  %a.val.cast = bitcast <8 x i4> %a.val to i32 ; the intrinsic is declared with i32 inputs, so each <8 x i4> vector is reinterpreted as one i32
+  %b.val.cast = bitcast <8 x i4> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.sdot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
index b2912cb2334..18ca71d33bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot2.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c)
+declare i32 @llvm.amdgcn.udot2(<2 x i16> %a, <2 x i16> %b, i32 %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot2_u32_u16 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2
-; GFX906: v_dot2_u32_u16
-define amdgpu_kernel void @test_llvm_amdgcn_udot2(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_clamp
+; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot2_clamp(
     i32 addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a,
     <2 x i16> addrspace(1)* %b,
@@ -13,7 +13,23 @@ entry:
   %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
   %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot2_no_clamp
+; GFX906: v_dot2_u32_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot2_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    i32 addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a,
+    <2 x i16> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %b.val = load <2 x i16>, <2 x i16> addrspace(1)* %b
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot2(<2 x i16> %a.val, <2 x i16> %b.val, i32 %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
index 5ce060de700..73d6a9ce968 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot4.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.udot4(i32 %a, i32 %b, i32 %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot4_u32_u8 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4
-; GFX906: v_dot4_u32_u8
-define amdgpu_kernel void @test_llvm_amdgcn_udot4(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_clamp
+; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot4_clamp(
     i32 addrspace(1)* %r,
     <4 x i8> addrspace(1)* %a,
     <4 x i8> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
   %a.val.cast = bitcast <4 x i8> %a.val to i32
   %b.val.cast = bitcast <4 x i8> %b.val to i32
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot4_no_clamp
+; GFX906: v_dot4_u32_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot4_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    i32 addrspace(1)* %r,
+    <4 x i8> addrspace(1)* %a,
+    <4 x i8> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <4 x i8>, <4 x i8> addrspace(1)* %a
+  %b.val = load <4 x i8>, <4 x i8> addrspace(1)* %b
+  %a.val.cast = bitcast <4 x i8> %a.val to i32 ; the intrinsic is declared with i32 inputs, so each <4 x i8> vector is reinterpreted as one i32
+  %b.val.cast = bitcast <4 x i8> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot4(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
index 2599305bc8e..c2f80cac8f7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.udot8.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GCN --check-prefix=GFX906
 
-declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c)
+declare i32 @llvm.amdgcn.udot8(i32 %a, i32 %b, i32 %c, i1 %clamp) ; i1 %clamp - tests in this file expect v_dot8_u32_u4 to carry a trailing "clamp" modifier when it is 1 and none when it is 0
 
-; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8
-; GFX906: v_dot8_u32_u4
-define amdgpu_kernel void @test_llvm_amdgcn_udot8(
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_clamp
+; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot8_clamp(
     i32 addrspace(1)* %r,
     <8 x i4> addrspace(1)* %a,
     <8 x i4> addrspace(1)* %b,
@@ -15,7 +15,25 @@ entry:
   %a.val.cast = bitcast <8 x i4> %a.val to i32
   %b.val.cast = bitcast <8 x i4> %b.val to i32
   %c.val = load i32, i32 addrspace(1)* %c
-  %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val)
+  %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 1)
+  store i32 %r.val, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_llvm_amdgcn_udot8_no_clamp
+; GFX906: v_dot8_u32_u4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
+define amdgpu_kernel void @test_llvm_amdgcn_udot8_no_clamp( ; negative case - the instruction pattern above is anchored at end-of-line right after the last operand, so a trailing "clamp" would fail the match
+    i32 addrspace(1)* %r,
+    <8 x i4> addrspace(1)* %a,
+    <8 x i4> addrspace(1)* %b,
+    i32 addrspace(1)* %c) {
+entry:
+  %a.val = load <8 x i4>, <8 x i4> addrspace(1)* %a
+  %b.val = load <8 x i4>, <8 x i4> addrspace(1)* %b
+  %a.val.cast = bitcast <8 x i4> %a.val to i32 ; the intrinsic is declared with i32 inputs, so each <8 x i4> vector is reinterpreted as one i32
+  %b.val.cast = bitcast <8 x i4> %b.val to i32
+  %c.val = load i32, i32 addrspace(1)* %c
+  %r.val = call i32 @llvm.amdgcn.udot8(i32 %a.val.cast, i32 %b.val.cast, i32 %c.val, i1 0) ; last operand is %clamp, passed as 0 here
   store i32 %r.val, i32 addrspace(1)* %r
   ret void
 }