author     Matt Arsenault <Matthew.Arsenault@amd.com>   2017-03-21 21:39:51 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2017-03-21 21:39:51 +0000
commit     3dbeefa978fb7e7b231b249f9cd90c67b9e83277 (patch)
tree       d74bf7fe30e44588d573919f3625edacb2586112 /llvm/test
parent     f6021ecddc73d14c94ad70938250d58f330795be (diff)
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel
Currently, functions using the default C calling convention are treated
the same as compute kernels. Make the kernel calling convention explicit
in the tests so the default calling convention can later be changed to a
non-kernel.

Converted with perl -pi -e 's/define void/define amdgpu_kernel void/'
on the relevant test directories (and undone in one place that actually
wanted a non-kernel).
llvm-svn: 298444
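
For illustration only, a sketch of how such a bulk rewrite can be applied; the
directory list below is taken from the test trees visible in this excerpt, not
necessarily the exact set the author used:

    # Hypothetical invocation; the directories shown are illustrative.
    find llvm/test/Analysis/CostModel/AMDGPU \
         llvm/test/Analysis/DivergenceAnalysis/AMDGPU \
         llvm/test/CodeGen/AMDGPU \
         -name '*.ll' -print0 \
      | xargs -0 perl -pi -e 's/define void/define amdgpu_kernel void/'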
Diffstat (limited to 'llvm/test')
714 files changed, 6249 insertions, 6249 deletions
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 76b21d26faa..6419eb11b2b 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -3,7 +3,7 @@
 
 ; CHECK: 'add_i32'
 ; CHECK: estimated cost of 1 for {{.*}} add i32
-define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %add = add i32 %vec, %b
   store i32 %add, i32 addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 
 ; CHECK: 'add_v2i32'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
-define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %add = add <2 x i32> %vec, %b
   store <2 x i32> %add, <2 x i32> addrspace(1)* %out
[... identical define void -> define amdgpu_kernel void hunks for @add_v3i32, @add_v4i32, @add_i64, @add_v2i64, @add_v3i64, @add_v4i64, @add_v16i64, @add_i16, @add_v2i16, @sub_i32, @sub_i64, @sub_i16, @sub_v2i16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/llvm/test/Analysis/CostModel/AMDGPU/bit-ops.ll
index a809dbd77bb..aa70f5032cb 100644
[... identical hunks for @or_i32, @or_i64, @xor_i32, @xor_i64, @and_i32, @and_i64 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/br.ll b/llvm/test/Analysis/CostModel/AMDGPU/br.ll
index 0b964939756..494f8d2c8b2 100644
[... identical hunks for @test_br_cost, @test_switch_cost ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/extractelement.ll
index c328d768646..1efbb5873ac 100644
[... identical hunks for @extractelement_v2i32, @extractelement_v2f32, @extractelement_v3i32, @extractelement_v4i32, @extractelement_v8i32, @extractelement_v8i32_dynindex, @extractelement_v2i64, @extractelement_v3i64, @extractelement_v4i64, @extractelement_v8i64, @extractelement_v4i8, @extractelement_v2i16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll b/llvm/test/Analysis/CostModel/AMDGPU/fabs.ll
index 9c551ec8afe..0d49e2967d2 100644
[... identical hunks for @fabs_f32, @fabs_v2f32, @fabs_v3f32, @fabs_f64, @fabs_v2f64, @fabs_v3f64, @fabs_f16, @fabs_v2f16, @fabs_v3f16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll b/llvm/test/Analysis/CostModel/AMDGPU/fadd.ll
index 00e91bd6223..d7ac7359299 100644
[... identical hunks for @fadd_f32, @fadd_v2f32, @fadd_v3f32, @fadd_f64, @fadd_v2f64, @fadd_v3f64, @fadd_f16, @fadd_v2f16, @fadd_v4f16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll b/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
index 3f374422ad9..caa9bff7b2a 100644
[... identical hunks for @fdiv_f32, @fdiv_v2f32, @fdiv_v3f32, @fdiv_f64, @fdiv_v2f64, @fdiv_v3f64, @fdiv_f16, @fdiv_v2f16, @fdiv_v4f16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll b/llvm/test/Analysis/CostModel/AMDGPU/fmul.ll
index 6303bb7988c..915c35a23b3 100644
[... identical hunks for @fmul_f32, @fmul_v2f32, @fmul_v3f32, @fmul_f64, @fmul_v2f64, @fmul_v3f64, @fmul_f16, @fmul_v2f16, @fmul_v4f16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll b/llvm/test/Analysis/CostModel/AMDGPU/fsub.ll
index e0850be9867..cb89d292f71 100644
[... identical hunks for @fsub_f32, @fsub_v2f32, @fsub_v3f32, @fsub_f64, @fsub_v2f64, @fsub_v3f64, @fsub_f16, @fsub_v2f16, @fsub_v4f16 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll b/llvm/test/Analysis/CostModel/AMDGPU/insertelement.ll
index 1765afe3169..6f296a3e7a3 100644
[... identical hunks for @insertelement_v2i32, @insertelement_v2i64, @insertelement_v2i16, @insertelement_v2i8 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/mul.ll b/llvm/test/Analysis/CostModel/AMDGPU/mul.ll
index cbc755a6e6a..aac7b68f50c 100644
[... identical hunks for @mul_i32, @mul_v2i32, @mul_v3i32, @mul_v4i32, @mul_i64, @mul_v2i64, @mul_v3i64, @mul_v4i64, @mul_v8i64 ...]

diff --git a/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll b/llvm/test/Analysis/CostModel/AMDGPU/shifts.ll
index 003aed7b2fc..85fb0ebe14e 100644
[... identical hunks for @shl_i32, @shl_i64, @lshr_i32, @lshr_i64, @ashr_i32, @ashr_i64 ...]

diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
index 319a697dfd3..d2266952259 100644
[... identical hunk for @ds_swizzle ...]

diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
index b4fa79a6ba9..6144ffea5b6 100644
[... identical hunk for @no_return_blocks ...]

diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
index ca93dda2c57..7ade8eabd45 100644
[... identical hunk for @unreachable_loop ...]

diff --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
index 669ee802c51..98fbc88a2cf 100644
[... identical hunks for @workitem_id_x, @workitem_id_y, @workitem_id_z, @mbcnt_lo, @mbcnt_hi ...]

diff --git a/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/llvm/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index edad18e244d..ca661cf9a71 100644
[... identical hunks for @local_address_load, @local_address_gep, @local_address_gep_const_offset, @local_address_gep_large_const_offset; the excerpt ends here, mid-file, and the remaining files of the 714 changed are not shown ...]
@@ -60,7 +60,7 @@ entry:
; SI: v_cmp_ne_u32
; SI-NOT: v_cmp_ne_u32
; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
%cmp = icmp ne i32 addrspace(3)* %lds, null
%x = select i1 %cmp, i32 123, i32 456
store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds)
; SI: s_mul_i32
; SI-NEXT: s_add_i32
; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
%ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
%val = load float, float addrspace(3)* %ptr
store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
%val = load float, float addrspace(3)* @g_lds
store float %val, float addrspace(1)* %out
ret void
@@ -95,14 +95,14 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
; FUNC-LABEL: {{^}}global_ptr:
; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
ret void
}
; FUNC-LABEL: {{^}}local_address_store:
; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
store i32 %val, i32 addrspace(3)* %out
ret void
}
@@ -111,7 +111,7 @@ define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
; SI: s_add_i32 [[SADDR:s[0-9]+]],
; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
store i32 %val, i32 addrspace(3)* %gep, align 4
ret void
@@ -121,7 +121,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
store i32 %val, i32 addrspace(3)* %gep, align 4
ret void
@@ -132,7 +132,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
store i32 %val, i32 addrspace(3)* %gep, align 4
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index e10ffa00c43..56a9e7022db 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -4,7 +4,7 @@
# REQUIRES: global-isel
--- |
- define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+ define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 78bf9d8e352..ea2ad2ba83a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -5,7 +5,7 @@
# REQUIRES: global-isel
--- |
- define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+ define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index 070d3bc1448..ea435725bf2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -4,7 +4,7 @@
# REQUIRES: global-isel
--- |
- define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+ define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
index 94473367e0a..3496b1ab71f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -3,12 +3,12 @@
# REQUIRES: global-isel
--- |
- define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
- define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+ define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+ define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
%tmp0 = load i32, i32 addrspace(1)* %ptr1
ret void
}
- define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
+ define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
%tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
%tmp2 = load i32, i32 addrspace(1)* %tmp1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index a3630b1ac92..8a6b3df9cff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -9,7 +9,7 @@
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
%1 = load i32, i32 addrspace(2)* %0
@@ -21,7 +21,7 @@ entry:
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
; VI:
s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 %1 = load i32, i32 addrspace(2)* %0 @@ -36,7 +36,7 @@ entry: ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 %1 = load i32, i32 addrspace(2)* %0 @@ -51,7 +51,7 @@ entry: ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; XGCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32 %1 = load i32, i32 addrspace(2)* %0 @@ -65,7 +65,7 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 %1 = load i32, i32 addrspace(2)* %0 @@ -79,7 +79,7 @@ entry: ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) { entry: %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 %1 = load i32, i32 addrspace(2)* %0 diff --git a/llvm/test/CodeGen/AMDGPU/add-debug.ll b/llvm/test/CodeGen/AMDGPU/add-debug.ll index 529905dd36a..b90c20b9748 100644 --- a/llvm/test/CodeGen/AMDGPU/add-debug.ll +++ b/llvm/test/CodeGen/AMDGPU/add-debug.ll @@ -3,7 +3,7 @@ ; REQUIRES: asserts ; Check that SelectionDAGDumper does not crash on int_SI_if. 
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/add.i16.ll b/llvm/test/CodeGen/AMDGPU/add.i16.ll index 6c5cdd3877d..b65e79f14de 100644 --- a/llvm/test/CodeGen/AMDGPU/add.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.i16.ll @@ -6,7 +6,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -23,7 +23,7 @@ define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -38,7 +38,7 @@ define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* % ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -53,7 +53,7 @@ define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -69,7 +69,7 @@ define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1) ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_dword [[ADD]] -define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ 
-89,7 +89,7 @@ define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -109,7 +109,7 @@ define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] -define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -130,7 +130,7 @@ define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll index a6247c73524..7e4546d2cfb 100644 --- a/llvm/test/CodeGen/AMDGPU/add.ll +++ b/llvm/test/CodeGen/AMDGPU/add.ll @@ -8,7 +8,7 @@ ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}} ;SI-NOT: [[REG]] ;SI: buffer_store_dword [[REG]], -define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -24,7 +24,7 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -44,7 +44,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { 
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -71,7 +71,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; SI: s_add_i32 ; SI: s_add_i32 ; SI: s_add_i32 -define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { +define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) { entry: %0 = add <8 x i32> %a, %b store <8 x i32> %0, <8 x i32> addrspace(1)* %out @@ -112,7 +112,7 @@ entry: ; SI: s_add_i32 ; SI: s_add_i32 ; SI: s_add_i32 -define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { +define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) { entry: %0 = add <16 x i32> %a, %b store <16 x i32> %0, <16 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ entry: ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %0 = add i64 %a, %b store i64 %0, i64 addrspace(1)* %out @@ -150,7 +150,7 @@ entry: ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { +define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) { entry: %0 = load i64, i64 addrspace(1)* %in %1 = add i64 %a, %0 @@ -169,7 +169,7 @@ entry: ; EG-DAG: ADD_INT ; EG-DAG: ADD_INT {{[* ]*}} ; EG-NOT: SUB -define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll index db3c88a1edc..dcf6d24fddb 100644 --- a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -7,7 +7,7 @@ ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -27,7 +27,7 @@ define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; VI: s_add_i32 ; VI: s_add_i32 -define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { +define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 %add = add <2 x i16> %a, %b @@ -41,7 +41,7 @@ define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; VI: s_add_i32 ; VI: s_add_i32 -define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* 
%out, <2 x i16> addrspace(2)* %in0) #1 { +define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %add = add <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -54,7 +54,7 @@ define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrs ; VI: v_add_i32 ; VI: v_add_i32 -define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { +define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = add <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}} -define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -83,7 +83,7 @@ define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> a ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}} -define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -102,7 +102,7 @@ define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i1 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -121,7 +121,7 @@ define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16 ; VI-NOT: v_add_u16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -141,7 +141,7 @@ define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 ; VI-NOT: v_add_u16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void 
@v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -173,7 +173,7 @@ define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} -define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -208,7 +208,7 @@ define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; VI: v_add_u16_e32 ; VI: buffer_store_dwordx4 -define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -236,7 +236,7 @@ define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; VI: buffer_store_dwordx2 -define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -264,7 +264,7 @@ define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/add_i128.ll b/llvm/test/CodeGen/AMDGPU/add_i128.ll index c80157ca9c5..00a125c2e44 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i128.ll @@ -6,7 +6,7 @@ ; GCN-NEXT: v_addc_u32_e32 
v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]], -define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid @@ -23,7 +23,7 @@ define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* ; GCN: v_addc_u32 ; GCN: v_addc_u32 ; GCN: v_addc_u32 -define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { +define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %foo, %a store i128 %result, i128 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* no ; GCN: v_addc_u32 ; GCN: v_addc_u32 ; GCN: v_addc_u32 -define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { +define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) { %foo = load i128, i128 addrspace(1)* %in, align 8 %result = add i128 %a, %foo store i128 %result, i128 addrspace(1)* %out @@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspa ; GCN: s_addc_u32 ; GCN: s_addc_u32 ; GCN: s_addc_u32 -define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { +define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) { %result = add i128 %a, %b store i128 %result, i128 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/add_i64.ll b/llvm/test/CodeGen/AMDGPU/add_i64.ll index 3d360b7d0b7..62733d5bfb6 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i64.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; SI-LABEL: {{^}}test_i64_vreg: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid @@ -21,7 +21,7 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa ; SI-LABEL: {{^}}sgpr_operand: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { +define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %foo, %a store i64 %result, i64 addrspace(1)* %out @@ -34,7 +34,7 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal ; SI-LABEL: {{^}}sgpr_operand_reversed: ; SI: v_add_i32 ; SI: 
v_addc_u32 -define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { +define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) { %foo = load i64, i64 addrspace(1)* %in, align 8 %result = add i64 %a, %foo store i64 %result, i64 addrspace(1)* %out @@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace ; SI: s_addc_u32 ; SI: s_add_u32 ; SI: s_addc_u32 -define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { +define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) { %result = add <2 x i64> %a, %b store <2 x i64> %result, <2 x i64> addrspace(1)* %out ret void @@ -58,7 +58,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, ; SI: v_addc_u32 ; SI: v_add_i32 ; SI: v_addc_u32 -define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid @@ -76,7 +76,7 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add ; SI-NOT: addc ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %add = add i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll index 481a3e2b31b..138bc36b9e1 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-captured.ll @@ -9,7 +9,7 @@ declare void @consume_ptr2int(i32) #0 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out -define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 { entry: %data = alloca i32, align 4 %cast = addrspacecast i32* %data to i32 addrspace(4)* @@ -22,7 +22,7 @@ entry: ; CHECK: %data = alloca i32, align 4 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* ; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out -define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 { +define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 { entry: %data = alloca i32, align 4 %cast = addrspacecast i32* %data to i32 addrspace(4)* @@ -35,7 +35,7 @@ entry: ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)* ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32 ; CHECK: call void @consume_ptr2int(i32 %ptr2int) -define void @addrspacecast_captured_call() #0 { +define amdgpu_kernel void @addrspacecast_captured_call() #0 { entry: %data = alloca i32, align 4 %cast = addrspacecast i32* %data to i32 addrspace(4)* 
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll index 67a19399920..8cabc7dae13 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll @@ -9,57 +9,57 @@ declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrs @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1 -define void @store_cast_0_flat_to_group_addrspacecast() #1 { +define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 { store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ret void } ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2 -define void @store_cast_0_group_to_flat_addrspacecast() #1 { +define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*) ret void } -; HSA: define void @store_constant_cast_group_gv_to_flat() #2 -define void @store_constant_cast_group_gv_to_flat() #1 { +; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2 +define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*) ret void } ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2 -define void @store_constant_cast_group_gv_gep_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 { store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void } ; HSA: @store_constant_cast_global_gv_to_flat() #1 -define void @store_constant_cast_global_gv_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 { store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*) ret void } ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1 -define void @store_constant_cast_global_gv_gep_to_flat() #1 { +define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 { store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) ret void } ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) store i32 %val, i32 addrspace(1)* %out ret void } ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst store i32 %val, i32 addrspace(1)* %out ret void } ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 
addrspace(1)* %out) #2 -define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst %val0 = extractvalue { i32, i1 } %val, 0 store i32 %val0, i32 addrspace(1)* %out @@ -67,28 +67,28 @@ define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) } ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2 -define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 { call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false) ret void } ; Can't just search the pointer value ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2 -define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { +define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 { store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out ret void } ; Can't just search pointer types ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2 -define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { +define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 { store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out ret void } ; Cast group to flat, do GEP, cast back to group ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2 -define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { +define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 { store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index b6ada5e3824..52b70293153 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -28,7 +28,7 @@ ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} -define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -58,7 +58,7 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 { ; CI: NumSgprs: {{[0-9][0-9]+}} ; GFX9: NumSgprs: {{[0-9]+}} -define void 
@use_private_to_flat_addrspacecast(i32* %ptr) #0 { +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -73,7 +73,7 @@ define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %stof ret void @@ -85,7 +85,7 @@ define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]] ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}} -define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %stof ret void @@ -102,7 +102,7 @@ define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 { ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: ds_write_b32 [[CASTPTR]], v[[K]] -define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void @@ -119,7 +119,7 @@ define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} -define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* store volatile i32 0, i32* %ftos ret void @@ -133,7 +133,7 @@ define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]] ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]] -define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void @@ -144,7 +144,7 @@ define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0 -define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* load volatile i32, i32 addrspace(2)* %ftos ret void @@ -158,7 +158,7 @@ define void 
@use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_0_group_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %cast ret void @@ -168,7 +168,7 @@ define void @cast_0_group_to_flat_addrspacecast() #0 { ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] -define void @cast_0_flat_to_group_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)* store volatile i32 7, i32 addrspace(3)* %cast ret void @@ -179,7 +179,7 @@ define void @cast_0_flat_to_group_addrspacecast() #0 { ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_neg1_group_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %cast ret void @@ -189,7 +189,7 @@ define void @cast_neg1_group_to_flat_addrspacecast() #0 { ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: ds_write_b32 [[PTR]], [[K]] -define void @cast_neg1_flat_to_group_addrspacecast() #0 { +define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)* store volatile i32 7, i32 addrspace(3)* %cast ret void @@ -204,7 +204,7 @@ define void @cast_neg1_flat_to_group_addrspacecast() #0 { ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}} ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]] -define void @cast_0_private_to_flat_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { %cast = addrspacecast i32* null to i32 addrspace(4)* store volatile i32 7, i32 addrspace(4)* %cast ret void @@ -214,7 +214,7 @@ define void @cast_0_private_to_flat_addrspacecast() #0 { ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}} ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen -define void @cast_0_flat_to_private_addrspacecast() #0 { +define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)* store volatile i32 7, i32* %cast ret void @@ -226,7 +226,7 @@ define void @cast_0_flat_to_private_addrspacecast() #0 { ; HSA-LABEL: {{^}}branch_use_flat_i32: ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} ; HSA: s_endpgm -define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { +define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 { entry: %cmp = icmp ne i32 %c, 0 br i1 %cmp, label %local, label %global @@ -259,7 +259,7 @@ end: ; HSA: flat_store_dword ; HSA: s_barrier ; HSA: flat_load_dword -define void 
@store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { +define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 { %alloca = alloca i32, i32 9, align 4 %x = call i32 @llvm.amdgcn.workitem.id.x() #2 %pptr = getelementptr i32, i32* %alloca, i32 %x diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll index c0c5305b852..ef742f56fae 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -16,7 +16,7 @@ entry: ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr: ; SI: s_endpgm -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 @@ -24,7 +24,7 @@ entry: ret void } -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %fadd32 = fadd float %load, 1.0 %bc = bitcast float %fadd32 to <2 x i16> @@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) ret void } -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2> %bc = bitcast <2 x i16> %add.v2i16 to float @@ -42,7 +42,7 @@ define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) ret void } -define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %fadd32 = fadd float %load, 1.0 %bc = bitcast float %fadd32 to <2 x half> @@ -51,7 +51,7 @@ define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in ret void } -define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind { %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4 %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0> %bc = bitcast <2 x half> %add.v2f16 to float @@ -60,14 +60,14 @@ define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in ret void } -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, i32 addrspace(1)* %out, align 4 ret void } -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -76,7 +76,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou ; FUNC-LABEL: 
{{^}}bitcast_v2i32_to_f64: ; SI: s_endpgm -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %add = add <2 x i32> %val, <i32 4, i32 9> %bc = bitcast <2 x i32> %add to double @@ -87,7 +87,7 @@ define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: ; SI: s_endpgm -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { %val = load double, double addrspace(1)* %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -96,7 +96,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace } ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64: -define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { +define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -112,7 +112,7 @@ end: } ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64: -define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { +define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll index a6d055891d4..79450b97c21 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.private-memory.ll @@ -15,7 +15,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; GCN-NOT: v0 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]] -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll index d78c75165be..0e5605961e1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -4,7 +4,7 @@ ; NOOP-LABEL: @noop_fdiv_fpmath( ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0 -define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { +define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { %md.25ulp = fdiv float %a, %b, !fpmath !0 store volatile float %md.25ulp, float addrspace(1)* %out ret void @@ -18,7 +18,7 @@ define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 { ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { +define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { %no.md 
= fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out @@ -51,7 +51,7 @@ define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 { ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}} ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0 -define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { %no.md = fdiv float 1.0, %x store volatile float %no.md, float addrspace(1)* %out @@ -89,7 +89,7 @@ define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 { ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1 -define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { +define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 { %no.md = fdiv <2 x float> %a, %b store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out -define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -158,7 +158,7 @@ define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out @@ -186,7 +186,7 @@ define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp -define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { +define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 { %x.insert = insertelement <2 x float> %x, float 1.0, i32 0 %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0 @@ -206,7 +206,7 @@ define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* % ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0 -define void 
@fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { %no.md = fdiv float %a, %b store volatile float %no.md, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index f812f4cf702..97cb9067f29 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -80,7 +80,7 @@ ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -102,7 +102,7 @@ entry: ; OPT-LABEL: @high_alignment( ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}} -define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [8 x i32], align 16 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -127,7 +127,7 @@ entry: ; OPT: alloca [5 x i32] ; SI-NOT: ds_write -define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -162,7 +162,7 @@ entry: ; SI-NOT: v_movrel %struct.point = type { i32, i32 } -define void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: %a = alloca %struct.point %b = alloca %struct.point @@ -191,7 +191,7 @@ entry: ; R600-NOT: MOVA_INT ; SI-NOT: v_movrel -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %prv_array_const = alloca [2 x i32] %prv_array = alloca [2 x i32] @@ -235,7 +235,7 @@ for.end: ; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]] ; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16 -define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16] %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 @@ -258,7 +258,7 @@ entry: ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 -define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 @@ -281,7 +281,7 @@ entry: ; ; A total of 5 bytes should be allocated and used. 
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 %1 = alloca [2 x i8], align 1 @@ -305,7 +305,7 @@ entry: ret void } -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]] %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 @@ -319,7 +319,7 @@ entry: ret void } -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -332,7 +332,7 @@ entry: ret void } -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]] %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 @@ -347,7 +347,7 @@ entry: %struct.pair32 = type { i32, i32 } -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]] %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 @@ -360,7 +360,7 @@ entry: ret void } -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32] %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 @@ -373,7 +373,7 @@ entry: ret void } -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -394,7 +394,7 @@ entry: ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ; -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a store i32 5, i32* %tmp0 @@ -410,7 +410,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; OPT-LABEL: @pointer_typed_alloca( ; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}} ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4 -define void @pointer_typed_alloca(i32 addrspace(1)* %A) { +define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 @@ -462,7 +462,7 @@ entry: ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel 
void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { %alloca = alloca [2 x <16 x i32>] %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a %tmp5 = load <16 x i32>, <16 x i32>* %tmp0 @@ -506,7 +506,7 @@ define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) { ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { %alloca = alloca [2 x <16 x float>] %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a %tmp5 = load <16 x float>, <16 x float>* %tmp0 @@ -522,7 +522,7 @@ define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) { ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { %alloca = alloca [16 x <2 x float>] %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a %tmp5 = load <2 x float>, <2 x float>* %tmp0 @@ -533,7 +533,7 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) { ; OPT-LABEL: @direct_alloca_read_0xi32( ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)* ; OPT: load [0 x i32], [0 x i32] addrspace(3)* -define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [0 x i32] store [0 x i32] [], [0 x i32]* %tmp @@ -545,7 +545,7 @@ entry: ; OPT-LABEL: @direct_alloca_read_1xi32( ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)* ; OPT: load [1 x i32], [1 x i32] addrspace(3)* -define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [1 x i32] store [1 x i32] [i32 0], [1 x i32]* %tmp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll index e515ca00d18..187320805c1 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll @@ -12,7 +12,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X -define void @ngroups_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -27,7 +27,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y -define void @ngroups_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -42,7 +42,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z -define void @ngroups_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.ngroups.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -57,7 +57,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? 
*}}[[VAL]], KC0[0].W -define void @global_size_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -72,7 +72,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X -define void @global_size_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -87,7 +87,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y -define void @global_size_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.global.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -102,7 +102,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z -define void @local_size_x (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -117,7 +117,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W -define void @local_size_y (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -132,7 +132,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X -define void @local_size_z (i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -153,7 +153,7 @@ entry: ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @tgid_x_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -165,7 +165,7 @@ entry: ; GCN-NOHSA: buffer_store_dword [[VVAL]] ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2 -define void @tgid_y_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -181,7 +181,7 @@ entry: ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @tgid_z_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -194,7 +194,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_x_legacy: ; GCN-NOHSA: buffer_store_dword v0 -define void @tidig_x_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -208,7 +208,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_y_legacy: ; GCN-NOHSA: buffer_store_dword v1 -define void @tidig_y_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -221,7 
+221,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_z_legacy: ; GCN-NOHSA: buffer_store_dword v2 -define void @tidig_z_legacy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/and-gcn.ll b/llvm/test/CodeGen/AMDGPU/and-gcn.ll index dde5f8c2176..2aec03aff8a 100644 --- a/llvm/test/CodeGen/AMDGPU/and-gcn.ll +++ b/llvm/test/CodeGen/AMDGPU/and-gcn.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}v_and_i64_br: ; SI: v_and_b32 ; SI: v_and_b32 -define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp0 = icmp eq i32 %tid, 0 diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll index 5d9dcf64deb..c356f8b87cf 100644 --- a/llvm/test/CodeGen/AMDGPU/and.ll +++ b/llvm/test/CodeGen/AMDGPU/and.ll @@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -31,7 +31,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -42,7 +42,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { ; FUNC-LABEL: {{^}}s_and_i32: ; SI: s_and_b32 -define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, %b store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -50,7 +50,7 @@ define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}s_and_constant_i32: ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687 -define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 ret void @@ -66,7 +66,7 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) { ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]] ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]] ; SI: buffer_store_dword [[VK]] -define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 ; Just to stop future replacement of copy to vgpr + store with VALU op. 
@@ -83,7 +83,7 @@ define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 ; SI: s_add_i32 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]] ; SI: buffer_store_dword [[VK]] -define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) { %and = and i32 %a, 1234567 %foo = add i32 %and, 1234567 %bar = add i32 %foo, %b @@ -93,7 +93,7 @@ define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr: ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -109,7 +109,7 @@ define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI-DAG: s_load_dword [[SA:s[0-9]+]] ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] -define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -123,7 +123,7 @@ define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1 ; SI-DAG: s_load_dword [[SA:s[0-9]+]] ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]] ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]] -define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) { +define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -135,7 +135,7 @@ define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; FUNC-LABEL: {{^}}v_and_constant_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}} -define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, 1234567 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -144,7 +144,7 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}} -define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, 64 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* % ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}} -define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 
addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) { %a = load i32, i32 addrspace(1)* %aptr, align 4 %and = and i32 %a, -16 store i32 %and, i32 addrspace(1)* %out, align 4 @@ -162,7 +162,7 @@ define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}s_and_i64 ; SI: s_and_b64 -define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and = and i64 %a, %b store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -171,7 +171,7 @@ define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; FIXME: Should use SGPRs ; FUNC-LABEL: {{^}}s_and_i1: ; SI: v_and_b32 -define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { +define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { %and = and i1 %a, %b store i1 %and, i1 addrspace(1)* %out ret void @@ -181,7 +181,7 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) { ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}} ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}} ; SI: buffer_store_dwordx2 -define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { %and = and i64 %a, 549756338176 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -191,7 +191,7 @@ define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}} ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}} ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}} -define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %and0 = and i64 %a, 549756338176 %and1 = and i64 %b, 549756338176 store volatile i64 %and0, i64 addrspace(1)* %out @@ -205,7 +205,7 @@ define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -223,7 +223,7 @@ define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) { %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -238,7 +238,7 @@ define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 ; FUNC-LABEL: {{^}}v_and_i64: ; SI: v_and_b32 ; SI: v_and_b32 -define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %and = and i64 %a, %b @@ -250,7 +250,7 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 
0xab19b207, {{v[0-9]+}} ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}} ; SI: buffer_store_dwordx2 -define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1231231234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -268,7 +268,7 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]] ; SI: buffer_store_dwordx2 ; SI: buffer_store_dwordx2 -define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr %and0 = and i64 %a, 1231231234567 @@ -288,7 +288,7 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]] -define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load volatile i64, i64 addrspace(1)* %aptr %b = load volatile i64, i64 addrspace(1)* %aptr %and0 = and i64 %a, 63 @@ -304,7 +304,7 @@ define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspac ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]] ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -317,7 +317,7 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}} ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -331,7 +331,7 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]] ; SI-NOT: and ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %and = and i64 %a, -8 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -344,7 +344,7 @@ define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64 ; SI-NOT: and ; SI: buffer_store_dword -define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 64 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -358,7 +358,7 @@ define void 
@s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI-NOT: and ; SI: s_add_u32 ; SI-NEXT: s_addc_u32 -define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) { %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b @@ -372,7 +372,7 @@ define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrsp ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 1 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -387,7 +387,7 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4607182418800017408 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -402,7 +402,7 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13830554455654793216 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -417,7 +417,7 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4602678819172646912 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -432,7 +432,7 @@ define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13826050856027422720 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -445,7 +445,7 @@ define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4611686018427387904 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -458,7 +458,7 @@ define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, 
{{s[0-9]+}}, -2.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13835058055282163712 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -473,7 +473,7 @@ define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4616189618054758400 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -488,7 +488,7 @@ define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13839561654909534208 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -505,7 +505,7 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 1082130432 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -518,7 +518,7 @@ define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, -1065353216 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -531,7 +531,7 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 4647714815446351872 store i64 %and, i64 addrspace(1)* %out, align 8 ret void @@ -544,7 +544,7 @@ define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0 ; SI-NOT: and ; SI: buffer_store_dwordx2 -define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %and = and i64 %a, 13871086852301127680 store i64 %and, i64 addrspace(1)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll 
b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll index 084a6933da2..e2620ce353c 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll @@ -11,22 +11,22 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 -; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr %val1 = call i32 @llvm.amdgcn.workgroup.id.y() @@ -34,8 +34,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -43,15 +43,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -59,8 +59,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -68,8 +68,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_x_y_z(i32 
addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workgroup.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() %val2 = call i32 @llvm.amdgcn.workgroup.id.z() @@ -79,29 +79,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { -define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { -define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.amdgcn.workitem.id.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workgroup.id.x() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -109,8 +109,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { -define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.y() %val1 = call i32 @llvm.amdgcn.workgroup.id.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -118,8 +118,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { -define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -129,8 +129,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { -define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.amdgcn.workitem.id.x() %val1 = call i32 @llvm.amdgcn.workitem.id.y() %val2 = call i32 @llvm.amdgcn.workitem.id.z() @@ -146,8 +146,8 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: 
define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { -define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 { +define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %bc @@ -155,8 +155,8 @@ define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { -define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 { +define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr() %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %bc @@ -164,58 +164,58 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { -define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 { +define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { -define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 { +define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 { %stof = addrspacecast i32* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)* store volatile i32 0, i32 addrspace(3)* %ftos ret void } -; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32* store volatile i32 0, i32* %ftos ret void } ; No-op addrspacecast should not use queue ptr -; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { -define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 { %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { -define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { +; HSA: define 
amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { +define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 { %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %stof ret void } -; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %ftos ret void } -; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { -define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { +define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 { %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)* %ld = load volatile i32, i32 addrspace(2)* %ftos ret void diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll index a4e7bb67d50..09750da4cb8 100644 --- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll +++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll @@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.x() #0 declare i32 @llvm.r600.read.local.size.y() #0 declare i32 @llvm.r600.read.local.size.z() #0 -; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { -define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr %val1 = call i32 @llvm.r600.read.tgid.y() @@ -35,8 +35,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { -define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 { +define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -44,15 +44,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) 
#3 { +define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tgid.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { -define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 { +define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -60,8 +60,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.y() %val1 = call i32 @llvm.r600.read.tgid.z() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -69,8 +69,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { -define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 { +define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tgid.x() %val1 = call i32 @llvm.r600.read.tgid.y() %val2 = call i32 @llvm.r600.read.tgid.z() @@ -80,29 +80,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { -define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 { +define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { -define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 { +define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.tidig.z() store i32 %val, i32 addrspace(1)* %ptr ret void } -; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { -define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tgid.x() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -110,8 +110,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { -define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 { +define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.y() %val1 = call i32 
@llvm.r600.read.tgid.y() store volatile i32 %val0, i32 addrspace(1)* %ptr @@ -119,8 +119,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { -define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 { +define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() %val2 = call i32 @llvm.r600.read.tidig.z() @@ -130,8 +130,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 { ret void } -; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { -define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { +; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 { +define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { %val0 = call i32 @llvm.r600.read.tidig.x() %val1 = call i32 @llvm.r600.read.tidig.y() %val2 = call i32 @llvm.r600.read.tidig.z() @@ -147,25 +147,25 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 { ret void } -; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.x() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.y() store i32 %val, i32 addrspace(1)* %ptr ret void } -; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { -; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { -define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 { +; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { +define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 { %val = call i32 @llvm.r600.read.local.size.z() store i32 %val, i32 addrspace(1)* %ptr ret void diff --git a/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll b/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll index f37b0f3382f..04fbe2ae1f9 100644 --- a/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll +++ b/llvm/test/CodeGen/AMDGPU/anonymous-gv.ll @@ -6,13 +6,13 @@ ; CHECK-LABEL: {{^}}test: ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1 ; CHECK: s_endpgm -define void @test() { +define amdgpu_kernel void @test() { store i32 1, i32 addrspace(1)* @0 ret void } ; CHECK-LABEL: {{^}}__unnamed_2: ; CHECK: s_endpgm -define void @1() { +define amdgpu_kernel void @1() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 87b4c86427c..3f220c40841 100644 --- 
a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN-LABEL: {{^}}anyext_i1_i32: ; GCN: v_cndmask_b32_e64 -define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) { entry: %tmp = icmp eq i32 %cond, 0 %tmp1 = zext i1 %tmp to i8 @@ -22,7 +22,7 @@ entry: ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]] ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]] ; VI: buffer_store_dword [[AND]] -define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { +define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) { entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index 7dfe342a2e9..daa3442097c 100644 --- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -24,7 +24,7 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64 ; SI-PROMOTE: ds_write_b32 [[PTRREG]] -define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { +define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 { %alloca = alloca [16 x i32], align 16 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll index b914edf2928..ddeffc10a08 100644 --- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll @@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_i32 ; SI: s_endpgm -define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { +define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) { %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll index 08321563810..3a7facfd73a 100644 --- a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -13,7 +13,7 @@ ; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_or_b32_e32 -define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = ashr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -40,7 +40,7 @@ define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16 ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_or_b32_e32 v{{[0-9]+}}, 
v{{[0-9]+}}, v{{[0-9]+}} -define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -57,7 +57,7 @@ define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -72,7 +72,7 @@ define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1) ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -86,7 +86,7 @@ define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1) ; GCN-LABEL: {{^}}ashr_imm_v_v2i16: ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4 -define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -100,7 +100,7 @@ define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; GCN-LABEL: {{^}}ashr_v_imm_v2i16: ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]] -define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -117,7 +117,7 @@ define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -135,7 +135,7 @@ define void 
@v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll index 25eae0b41ae..4f9526ddab5 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll @@ -12,7 +12,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -33,7 +33,7 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 @@ -45,7 +45,7 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -65,7 +65,7 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]] ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic %result = extractvalue { i32, i1 } %pair, 0 @@ -84,7 +84,7 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw ; 
GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]] ; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { +define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic %result = extractvalue { i64, i1 } %pair, 0 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll index 4b014e09b63..e0fe6641fa1 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_add.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}atomic_add_local: ; R600: LDS_ADD * ; SI: ds_add_u32 -define void @atomic_add_local(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst ret void } @@ -13,7 +13,7 @@ define void @atomic_add_local(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_add_local_const_offset: ; R600: LDS_ADD * ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst ret void @@ -22,7 +22,7 @@ define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_add_ret_local: ; R600: LDS_ADD_RET * ; SI: ds_add_rtn_u32 -define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out ret void @@ -31,7 +31,7 @@ define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset: ; R600: LDS_ADD_RET * ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll index c6e5b1136d7..a0275893919 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_load_sub.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}atomic_sub_local: ; R600: LDS_SUB * ; SI: ds_sub_u32 -define void @atomic_sub_local(i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) { %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst ret void } @@ -13,7 +13,7 @@ define void @atomic_sub_local(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset: ; R600: LDS_SUB * ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 -define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { +define amdgpu_kernel void 
@atomic_sub_local_const_offset(i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst ret void @@ -22,7 +22,7 @@ define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) { ; FUNC-LABEL: {{^}}atomic_sub_ret_local: ; R600: LDS_SUB_RET * ; SI: ds_sub_rtn_u32 -define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out ret void @@ -31,7 +31,7 @@ define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset: ; R600: LDS_SUB_RET * ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20 -define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { +define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) { %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5 %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst store i32 %val, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll index cab377feacb..63a6f6a8d32 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll @@ -5,7 +5,7 @@ ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_64_max_64() #0 { +define amdgpu_kernel void @min_64_max_64() #0 { entry: ret void } @@ -16,7 +16,7 @@ attributes #0 = {"amdgpu-flat-work-group-size"="64,64"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_64_max_128() #1 { +define amdgpu_kernel void @min_64_max_128() #1 { entry: ret void } @@ -27,7 +27,7 @@ attributes #1 = {"amdgpu-flat-work-group-size"="64,128"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @min_128_max_128() #2 { +define amdgpu_kernel void @min_128_max_128() #2 { entry: ret void } @@ -39,7 +39,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"} ; CHECK: NumSGPRsForWavesPerEU: 13 ; CHECK: NumVGPRsForWavesPerEU: 32 @var = addrspace(1) global float 0.0 -define void @min_1024_max_2048() #3 { +define amdgpu_kernel void @min_1024_max_2048() #3 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index 58fe6e235ee..ac2f7b4a4a4 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -8,7 +8,7 @@ ; ALL: SGPRBlocks: 1 ; ALL: NumSGPRsForWavesPerEU: 9 -define void @max_9_sgprs(i32 addrspace(1)* %out1, +define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, @@ -49,7 +49,7 @@ define void @max_9_sgprs(i32 addrspace(1)* %out1, ; TOSMEM: SGPRBlocks: 1 ; TOSMEM: NumSGPRsForWavesPerEU: 16 -define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1, +define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 
addrspace(1)* %out1, i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, @@ -90,7 +90,7 @@ stores: ; XALL: SGPRBlocks: 2 ; XALL: NumSGPRsForWavesPerEU: 18 -;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, +;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1, ; i32 addrspace(1)* %out2, ; i32 addrspace(1)* %out3, ; i32 addrspace(1)* %out4, diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll index 97feb7276b7..979665ff0a8 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}max_20_vgprs: ; CHECK: VGPRBlocks: 4 ; CHECK: NumVGPRsForWavesPerEU: 20 -define void @max_20_vgprs() #1 { +define amdgpu_kernel void @max_20_vgprs() #1 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll index c1c334283e6..3dda73bc336 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll @@ -6,7 +6,7 @@ ; CHECK: VGPRBlocks: 32 ; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 129 -define void @empty_exactly_1() #0 { +define amdgpu_kernel void @empty_exactly_1() #0 { entry: ret void } @@ -18,7 +18,7 @@ attributes #0 = {"amdgpu-waves-per-eu"="1,1"} ; CHECK: VGPRBlocks: 10 ; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 41 -define void @empty_exactly_5() #1 { +define amdgpu_kernel void @empty_exactly_5() #1 { entry: ret void } @@ -30,7 +30,7 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_exactly_10() #2 { +define amdgpu_kernel void @empty_exactly_10() #2 { entry: ret void } @@ -42,7 +42,7 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_1() #3 { +define amdgpu_kernel void @empty_at_least_1() #3 { entry: ret void } @@ -54,7 +54,7 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_5() #4 { +define amdgpu_kernel void @empty_at_least_5() #4 { entry: ret void } @@ -66,7 +66,7 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_least_10() #5 { +define amdgpu_kernel void @empty_at_least_10() #5 { entry: ret void } @@ -80,7 +80,7 @@ attributes #5 = {"amdgpu-waves-per-eu"="10"} ; CHECK: VGPRBlocks: 10 ; CHECK: NumSGPRsForWavesPerEU: 102 ; CHECK: NumVGPRsForWavesPerEU: 41 -define void @empty_at_most_5() #6 { +define amdgpu_kernel void @empty_at_most_5() #6 { entry: ret void } @@ -92,7 +92,7 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void @empty_at_most_10() #7 { +define amdgpu_kernel void @empty_at_most_10() #7 { entry: ret void } @@ -106,7 +106,7 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"} ; CHECK: VGPRBlocks: 0 ; CHECK: NumSGPRsForWavesPerEU: 1 ; CHECK: NumVGPRsForWavesPerEU: 1 -define void 
@empty_between_5_and_10() #8 { +define amdgpu_kernel void @empty_between_5_and_10() #8 { entry: ret void } @@ -120,7 +120,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"} ; CHECK: VGPRBlocks: 5 ; CHECK: NumSGPRsForWavesPerEU: 13 ; CHECK: NumVGPRsForWavesPerEU: 24 -define void @exactly_10() #9 { +define amdgpu_kernel void @exactly_10() #9 { %val0 = load volatile float, float addrspace(1)* @var %val1 = load volatile float, float addrspace(1)* @var %val2 = load volatile float, float addrspace(1)* @var diff --git a/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll b/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll index 0282bc34c0e..17adb89900c 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-unparseable.ll @@ -1,56 +1,56 @@ ; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_0() #0 { +define amdgpu_kernel void @unparseable_single_0() #0 { entry: ret void } attributes #0 = {"amdgpu-num-sgpr"} ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_1() #1 { +define amdgpu_kernel void @unparseable_single_1() #1 { entry: ret void } attributes #1 = {"amdgpu-num-sgpr"="k"} ; CHECK: can't parse integer attribute amdgpu-num-sgpr -define void @unparseable_single_2() #2 { +define amdgpu_kernel void @unparseable_single_2() #2 { entry: ret void } attributes #2 = {"amdgpu-num-sgpr"="1,2"} ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_0() #3 { +define amdgpu_kernel void @unparseable_pair_0() #3 { entry: ret void } attributes #3 = {"amdgpu-flat-work-group-size"} ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_1() #4 { +define amdgpu_kernel void @unparseable_pair_1() #4 { entry: ret void } attributes #4 = {"amdgpu-flat-work-group-size"="k"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_2() #5 { +define amdgpu_kernel void @unparseable_pair_2() #5 { entry: ret void } attributes #5 = {"amdgpu-flat-work-group-size"="1"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_3() #6 { +define amdgpu_kernel void @unparseable_pair_3() #6 { entry: ret void } attributes #6 = {"amdgpu-flat-work-group-size"="1,k"} ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size -define void @unparseable_pair_4() #7 { +define amdgpu_kernel void @unparseable_pair_4() #7 { entry: ret void } diff --git a/llvm/test/CodeGen/AMDGPU/basic-branch.ll b/llvm/test/CodeGen/AMDGPU/basic-branch.ll index d4538ee2d25..e245e4296df 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-branch.ll @@ -15,7 +15,7 @@ ; GCN: {{^}}[[END]]: ; GCN: s_endpgm -define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { +define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 { %cmp = icmp ne i32 %val, 0 br i1 %cmp, label %store, label %end @@ -39,7 +39,7 @@ end: ; GCN: {{^}}[[END]]: ; GCN: s_endpgm -define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { +define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 { %cmp0 = icmp ne i1 %val, 0 br i1 %cmp0, label %store, label %end diff 
--git a/llvm/test/CodeGen/AMDGPU/basic-loop.ll b/llvm/test/CodeGen/AMDGPU/basic-loop.ll index f0263caf5d6..de45190cdaa 100644 --- a/llvm/test/CodeGen/AMDGPU/basic-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/basic-loop.ll @@ -2,7 +2,7 @@ ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s ; CHECK-LABEL: {{^}}test_loop: -define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { +define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind { entry: br label %loop.body diff --git a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll index e48744f40fb..5e39a6c6774 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe-patterns.ll @@ -5,7 +5,7 @@ ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]] -define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -32,7 +32,7 @@ define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 ; GCN: [[BFE]] ; GCN: [[SHL]] -define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -52,7 +52,7 @@ define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace( ; GCN: s_load_dword [[WIDTH:s[0-9]+]] ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] -define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width @@ -68,7 +68,7 @@ define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] ; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]] -define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width @@ -83,7 +83,7 @@ define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 ; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]] ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]] -define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* 
%in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -110,7 +110,7 @@ define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 ; GCN: [[BFE]] ; GCN: [[SHL]] -define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x @@ -130,7 +130,7 @@ define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace( ; GCN: s_load_dword [[WIDTH:s[0-9]+]] ; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]] ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]] -define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width @@ -146,7 +146,7 @@ define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { ; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]] ; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]] ; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]] -define void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { +define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x %sub = sub i32 32, %width diff --git a/llvm/test/CodeGen/AMDGPU/bfe_uint.ll b/llvm/test/CodeGen/AMDGPU/bfe_uint.ll index 32e3fc26106..2c8c9a5ec93 100644 --- a/llvm/test/CodeGen/AMDGPU/bfe_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/bfe_uint.ll @@ -2,7 +2,7 @@ ; CHECK: {{^}}bfe_def: ; CHECK: BFE_UINT -define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) { entry: %0 = lshr i32 %x, 5 %1 = and i32 %0, 15 ; 0xf @@ -17,7 +17,7 @@ entry: ; CHECK: {{^}}bfe_shift: ; CHECK-NOT: BFE_UINT -define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { entry: %0 = lshr i32 %x, 16 %1 = and i32 %0, 65535 ; 0xffff diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 5156137fd78..7870e5f378d 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -9,7 +9,7 @@ ; R600: BFI_INT ; SI: @bfi_def ; SI: v_bfi_b32 -define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = xor i32 %x, -1 %1 = and i32 %z, %0 @@ -25,7 +25,7 @@ entry: ; R600: BFI_INT ; SI: @bfi_sha256_ch ; SI: v_bfi_b32 -define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = xor i32 %y, %z %1 = and i32 %x, %0 @@ -42,7 +42,7 @@ entry: ; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}} ; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}} 
-define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { +define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) { entry: %0 = and i32 %x, %z %1 = or i32 %x, %z diff --git a/llvm/test/CodeGen/AMDGPU/bfm.ll b/llvm/test/CodeGen/AMDGPU/bfm.ll index 790458d0d60..5673995588d 100644 --- a/llvm/test/CodeGen/AMDGPU/bfm.ll +++ b/llvm/test/CodeGen/AMDGPU/bfm.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}bfm_pattern: ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %a = shl i32 1, %x %b = sub i32 %a, 1 %c = shl i32 %b, %y @@ -14,7 +14,7 @@ define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { ; FUNC-LABEL: {{^}}bfm_pattern_simple: ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0 -define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 { %a = shl i32 1, %x %b = sub i32 %a, 1 store i32 %b, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll index 04384bf451e..cf95f74afb8 100644 --- a/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/bitcast-vector-extract.ll @@ -11,7 +11,7 @@ ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) { %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -27,7 +27,7 @@ define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %ou ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) { %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out @@ -43,7 +43,7 @@ define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %ou ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) { %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double> store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out @@ -59,7 +59,7 @@ define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %o ; GCN: buffer_store_dwordx4 ; GCN-NOT: v_mov_b32 ; GCN: buffer_store_dwordx4 -define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { +define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) { %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float> store volatile <8 x float> %vec0.bc, <8 x 
float> addrspace(1)* %out @@ -70,7 +70,7 @@ define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %o ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source: ; GCN-NOT: store_dword -define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { +define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 %bc = bitcast i64 %undef to <2 x i32> store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out @@ -79,7 +79,7 @@ define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* ; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt: ; GCN-NOT: store_dword -define void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { +define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 %bc = bitcast i64 %undef to <2 x i32> %elt1 = extractelement <2 x i32> %bc, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll index f7dc1a9d37e..3616ec1f45d 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}materialize_0_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) { store i32 0, i32 addrspace(1)* %out ret void } @@ -16,7 +16,7 @@ define void @materialize_0_i32(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_0_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) { store i64 0, i64 addrspace(1)* %out ret void } @@ -24,7 +24,7 @@ define void @materialize_0_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_neg1_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_neg1_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) { store i32 -1, i32 addrspace(1)* %out ret void } @@ -33,7 +33,7 @@ define void @materialize_neg1_i32(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_neg1_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) { store i64 -1, i64 addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ define void @materialize_neg1_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_signbit_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_signbit_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) { store i32 -2147483648, i32 addrspace(1)* %out ret void } @@ -50,7 +50,7 @@ define void @materialize_signbit_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; 
GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_signbit_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) { store i64 -9223372036854775808, i64 addrspace(1)* %out ret void } @@ -58,7 +58,7 @@ define void @materialize_signbit_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_neg16_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { store i32 268435455, i32 addrspace(1)* %out ret void } @@ -67,7 +67,7 @@ define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { store i64 1152921504606846975, i64 addrspace(1)* %out ret void } @@ -75,7 +75,7 @@ define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_neg17_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { store i32 -134217729, i32 addrspace(1)* %out ret void } @@ -84,7 +84,7 @@ define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { store i64 -576460752303423489, i64 addrspace(1)* %out ret void } @@ -92,7 +92,7 @@ define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_64_i32: ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_64_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) { store i32 33554432, i32 addrspace(1)* %out ret void } @@ -101,7 +101,7 @@ define void @materialize_rev_64_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_64_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) { store i64 144115188075855872, i64 addrspace(1)* %out ret void } @@ -109,7 +109,7 @@ define void @materialize_rev_64_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_65_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_65_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) { store i32 -2113929216, i32 addrspace(1)* %out ret void } @@ -118,7 +118,7 @@ define void @materialize_rev_65_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void 
@materialize_rev_65_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) { store i64 -9079256848778919936, i64 addrspace(1)* %out ret void } @@ -126,7 +126,7 @@ define void @materialize_rev_65_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_3_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_3_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) { store i32 -1073741824, i32 addrspace(1)* %out ret void } @@ -135,7 +135,7 @@ define void @materialize_rev_3_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_3_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) { store i64 -4611686018427387904, i64 addrspace(1)* %out ret void } @@ -143,7 +143,7 @@ define void @materialize_rev_3_i64(i64 addrspace(1)* %out) { ; GCN-LABEL: {{^}}materialize_rev_1.0_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}} ; GCN: buffer_store_dword [[K]] -define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { store i32 508, i32 addrspace(1)* %out ret void } @@ -152,70 +152,70 @@ define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}} -define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { +define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) { store i64 508, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}s_materialize_0_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}} -define void @s_materialize_0_i32() { +define amdgpu_kernel void @s_materialize_0_i32() { call void asm sideeffect "; use $0", "s"(i32 0) ret void } ; GCN-LABEL: {{^}}s_materialize_1_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}} -define void @s_materialize_1_i32() { +define amdgpu_kernel void @s_materialize_1_i32() { call void asm sideeffect "; use $0", "s"(i32 1) ret void } ; GCN-LABEL: {{^}}s_materialize_neg1_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}} -define void @s_materialize_neg1_i32() { +define amdgpu_kernel void @s_materialize_neg1_i32() { call void asm sideeffect "; use $0", "s"(i32 -1) ret void } ; GCN-LABEL: {{^}}s_materialize_signbit_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}} -define void @s_materialize_signbit_i32() { +define amdgpu_kernel void @s_materialize_signbit_i32() { call void asm sideeffect "; use $0", "s"(i32 -2147483648) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_64_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}} -define void @s_materialize_rev_64_i32() { +define amdgpu_kernel void @s_materialize_rev_64_i32() { call void asm sideeffect "; use $0", "s"(i32 33554432) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_65_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}} -define void @s_materialize_rev_65_i32() { +define amdgpu_kernel void @s_materialize_rev_65_i32() { call void asm sideeffect "; use $0", "s"(i32 -2113929216) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32: ; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}} -define void @s_materialize_rev_neg16_i32() { +define amdgpu_kernel void @s_materialize_rev_neg16_i32() { call void asm sideeffect "; use $0", "s"(i32 
268435455) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32: ; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}} -define void @s_materialize_rev_neg17_i32() { +define amdgpu_kernel void @s_materialize_rev_neg17_i32() { call void asm sideeffect "; use $0", "s"(i32 -134217729) ret void } ; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32: ; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}} -define void @s_materialize_rev_1.0_i32() { +define amdgpu_kernel void @s_materialize_rev_1.0_i32() { call void asm sideeffect "; use $0", "s"(i32 508) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll index 43a4200cb3b..539373f7bde 100644 --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -14,7 +14,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1 ; FUNC-LABEL: {{^}}s_brev_i16: ; SI: s_brev_b32 -define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { +define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out ret void @@ -22,7 +22,7 @@ define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 { ; FUNC-LABEL: {{^}}v_brev_i16: ; SI: v_bfrev_b32_e32 -define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 { %val = load i16, i16 addrspace(1)* %valptr %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1 store i16 %brev, i16 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalia ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], ; SI: s_endpgm -define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { +define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out ret void @@ -46,7 +46,7 @@ define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 { ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { %val = load i32, i32 addrspace(1)* %valptr %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1 store i32 %brev, i32 addrspace(1)* %out @@ -56,7 +56,7 @@ define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia ; FUNC-LABEL: {{^}}s_brev_v2i32: ; SI: s_brev_b32 ; SI: s_brev_b32 -define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { +define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 { %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out ret void @@ -65,7 +65,7 @@ define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) ; FUNC-LABEL: {{^}}v_brev_v2i32: ; SI: v_bfrev_b32_e32 ; SI: v_bfrev_b32_e32 -define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 { %val = load <2 x i32>, <2 x i32> addrspace(1)* 
%valptr %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out @@ -73,7 +73,7 @@ define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp } ; FUNC-LABEL: {{^}}s_brev_i64: -define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { +define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out ret void @@ -81,7 +81,7 @@ define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 { ; FUNC-LABEL: {{^}}v_brev_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 -define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 { %val = load i64, i64 addrspace(1)* %valptr %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1 store i64 %brev, i64 addrspace(1)* %out @@ -89,14 +89,14 @@ define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia } ; FUNC-LABEL: {{^}}s_brev_v2i64: -define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { +define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 { %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_brev_v2i64: -define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index 3fe6f87c555..b7a0c8738df 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -20,7 +20,7 @@ ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm -define void @br_cc_f16( +define amdgpu_kernel void @br_cc_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -59,7 +59,7 @@ two: ; GCN: two{{$}} ; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]] -define void @br_cc_f16_imm_a( +define amdgpu_kernel void @br_cc_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -92,7 +92,7 @@ two: ; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} ; GCN: buffer_store_short v[[B_F16]] ; GCN: s_endpgm -define void @br_cc_f16_imm_b( +define amdgpu_kernel void @br_cc_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll index 86b8dd89e7d..ede15559c4f 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relax-spill.ll @@ -5,7 +5,7 @@ ; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot! 
-define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 { entry: %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0 %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index 18bbc3e5f07..263059d4a6e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -26,7 +26,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { bb: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch @@ -68,7 +68,7 @@ bb3: ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]] ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 { bb0: %cmp = icmp eq i32 %cnd, 0 br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch @@ -108,7 +108,7 @@ bb3: ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword [[V_CND]] ; GCN: s_endpgm -define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { +define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 { bb0: %cmp = fcmp oeq float %cnd, 0.0 br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch @@ -141,7 +141,7 @@ bb3: ; GCN: s_or_b64 exec, exec, [[SAVE]] ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { +define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = zext i32 %tid to i64 @@ -188,7 +188,7 @@ bb3: ; GCN-NEXT: [[ENDBB]]: ; GCN-NEXT: s_endpgm -define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { +define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 { bb: br label %bb2 @@ -243,7 +243,7 @@ bb3: ; GCN: buffer_store_dword [[BB4_K]] ; GCN-NEXT: s_endpgm ; GCN-NEXT: .Lfunc_end{{[0-9]+}}: -define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) { bb0: %tmp = icmp ne i32 %arg1, 0 br i1 %tmp, label %bb2, label %bb3 @@ -285,7 +285,7 @@ bb4: ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}} ; GCN-NEXT: s_setpc_b64 vcc ; GCN-NEXT .Lfunc_end{{[0-9]+}}: -define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { +define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) { entry: br label %loop @@ -342,7 +342,7 @@ loop: ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_endpgm -define void @expand_requires_expand(i32 %cond0) #0 { +define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 { bb0: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp slt i32 %cond0, 0 @@ -399,7 +399,7 @@ bb3: ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN-NEXT: s_sleep 5 ; GCN-NEXT: s_endpgm 
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { +define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -462,7 +462,7 @@ endif: ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN: buffer_store_dword ; GCN-NEXT: s_endpgm -define void @analyze_mask_branch() #0 { +define amdgpu_kernel void @analyze_mask_branch() #0 { entry: %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"() %cmp0 = fcmp ogt float %reg, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/bswap.ll b/llvm/test/CodeGen/AMDGPU/bswap.ll index c6895173109..d2dacd7c17b 100644 --- a/llvm/test/CodeGen/AMDGPU/bswap.ll +++ b/llvm/test/CodeGen/AMDGPU/bswap.ll @@ -17,7 +17,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone ; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone store i32 %bswap, i32 addrspace(1)* %out, align 4 @@ -32,7 +32,7 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8 @@ -53,7 +53,7 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace( ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace( ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_bfi_b32 ; SI: s_endpgm -define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind { %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32 @@ -95,21 +95,21 @@ define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace( ; FUNC-LABEL: {{^}}test_bswap_i64: ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0 -define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone store i64 %bswap, i64 
addrspace(1)* %out, align 8 ret void } -define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16 ret void } -define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind { %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32 diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll index 0a5774c601d..d77b0ab9fbb 100644 --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -10,7 +10,7 @@ ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6 ; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}} -define void @build_vector2 (<2 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) { entry: store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out ret void @@ -28,7 +28,7 @@ entry: ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8 ; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}} -define void @build_vector4 (<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) { entry: store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/call.ll b/llvm/test/CodeGen/AMDGPU/call.ll index 6d101e1537c..769c7bb3eee 100644 --- a/llvm/test/CodeGen/AMDGPU/call.ll +++ b/llvm/test/CodeGen/AMDGPU/call.ll @@ -10,7 +10,7 @@ declare i32 @external_function(i32) nounwind -define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -25,7 +25,7 @@ define i32 @defined_function(i32 %x) nounwind noinline { ret i32 %y } -define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -35,7 +35,7 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ret void } -define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll index e2e4f164fe8..17601415081 100644 --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}store_fi_lifetime: ; GCN: v_mov_b32_e32 
[[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 { entry: %b = alloca i8 call void @llvm.lifetime.start(i64 1, i8* %b) @@ -18,7 +18,7 @@ entry: ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}} ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]] ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]] -define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { %tmp = alloca float store float 4.0, float *%tmp store float* %tmp, float* addrspace(3)* %ptr @@ -38,7 +38,7 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 { ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]] -define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float store float 4.0, float* %tmp0 @@ -54,7 +54,7 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 { ; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -define void @stored_fi_to_self() #0 { +define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32* ; Avoid optimizing everything out @@ -73,7 +73,7 @@ define void @stored_fi_to_self() #0 { ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} -define void @stored_fi_to_self_offset() #0 { +define amdgpu_kernel void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32] %tmp1 = alloca i32* @@ -98,7 +98,7 @@ define void @stored_fi_to_self_offset() #0 { ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} -define void @stored_fi_to_fi() #0 { +define amdgpu_kernel void @stored_fi_to_fi() #0 { %tmp0 = alloca i32* %tmp1 = alloca i32* %tmp2 = alloca i32* @@ -118,7 +118,7 @@ define void @stored_fi_to_fi() #0 { ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp store float* %tmp, float* addrspace(1)* %ptr @@ -136,7 +136,7 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 { ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float %tmp2 = alloca float @@ -163,7 +163,7 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 { ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { +define 
amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { %tmp0 = alloca [4096 x i32] %tmp1 = alloca [4096 x i32] %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0 @@ -186,7 +186,7 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 { ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] -define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { +define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 { entry: %b = alloca i32, align 4 %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index 03d8cd9eeea..697f26b83a4 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -7,7 +7,7 @@ ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] ; GCN: s_endpgm -define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { entry: %cmp = icmp eq i32 %n, -1 br i1 %cmp, label %for.exit, label %for.body @@ -31,7 +31,7 @@ for.body: ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] -define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -52,7 +52,7 @@ for.body: ; GCN-LABEL: {{^}}loop_const_false: ; GCN-NOT: s_branch ; GCN: s_endpgm -define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -74,7 +74,7 @@ for.body: ; GCN-LABEL: {{^}}loop_const_undef: ; GCN-NOT: s_branch ; GCN: s_endpgm -define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind { +define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind { entry: br label %for.body @@ -104,7 +104,7 @@ for.body: ; GCN: s_cbranch_vccnz [[LOOPBB]] ; GCN-NEXT: ; BB#2 ; GCN-NEXT: s_endpgm -define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { +define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind { entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll index 75b87e48622..53fe89730f3 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-stack-bug.ll @@ -35,7 +35,7 @@ ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested3: -define void @nested3(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -68,7 +68,7 @@ end: ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested4: -define void @nested4(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -109,7 +109,7 @@ end: ; BUG32-NOT: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested7: -define void @nested7(i32 addrspace(1)* %out, i32 %cond) { +define 
amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end @@ -174,7 +174,7 @@ end: ; BUG32: Applying bug work-around ; NOBUG-NOT: Applying bug work-around ; FUNC-LABEL: {{^}}nested8: -define void @nested8(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp sgt i32 %cond, 0 br i1 %0, label %if.1, label %end diff --git a/llvm/test/CodeGen/AMDGPU/cf_end.ll b/llvm/test/CodeGen/AMDGPU/cf_end.ll index c74ee22868d..3c990e0a4bd 100644 --- a/llvm/test/CodeGen/AMDGPU/cf_end.ll +++ b/llvm/test/CodeGen/AMDGPU/cf_end.ll @@ -4,6 +4,6 @@ ; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80] ; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88] -define void @eop() { +define amdgpu_kernel void @eop() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll index 6db9a0761a0..ac35dd0bef5 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll @@ -11,7 +11,7 @@ ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32: ; GCN: flat_load_dword ; GCN: {{^}}BB0_2: -define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 @@ -43,7 +43,7 @@ done: ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32: ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 @@ -76,7 +76,7 @@ done: ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32: ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd -define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { +define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) { entry: %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index b14797c7897..0ebff8996fb 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32: ; GCN: {{^}}BB0_2: -define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7 @@ -45,7 +45,7 @@ done: ; GCN: 
buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB1_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535 @@ -72,7 +72,7 @@ done: ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}} ; GCN: {{^}}BB2_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095 @@ -99,7 +99,7 @@ done: ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} ; GCN: {{^}}BB3_2: ; GCN: s_or_b64 exec -define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999 %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096 @@ -131,7 +131,7 @@ done: ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} ; GCN: {{^}}BB4_2: -define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -172,7 +172,7 @@ done: ; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} ; GCN: {{^BB[0-9]+}}_2: -define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -209,7 +209,7 @@ done: ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} ; GCN: {{^BB[0-9]+}}_2: -define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { +define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: %alloca = alloca [512 x i32], align 4 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998 @@ -241,7 +241,7 @@ done: ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN: {{^BB[0-9]+}}_2: -define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* 
%out, i32 addrspace(1)* %in, i32 %offset) { entry: %offset.ext = zext i32 %offset to i64 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 @@ -271,7 +271,7 @@ done: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7 @@ -300,7 +300,7 @@ done: ; GCN: s_and_saveexec_b64 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255 @@ -333,7 +333,7 @@ done: ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256 @@ -365,7 +365,7 @@ done: ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}} ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295 @@ -396,7 +396,7 @@ done: ; GCN: s_addc_u32 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181 @@ -426,7 +426,7 @@ done: ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143 @@ -464,7 +464,7 @@ done: ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}} ; GCN: s_or_b64 exec, exec -define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 
addrspace(1)* %out, i64 999999 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144 @@ -494,7 +494,7 @@ done: ; GCN: s_load_dword [[SREG1:s[0-9]+]], ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]] ; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5 -define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind { +define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind { entry: %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0 %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2 @@ -521,7 +521,7 @@ bb34: ; OPT: if: ; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64 ; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095 -define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { +define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024 %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095 @@ -548,7 +548,7 @@ done: ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %sunkaddr2, i32 2 seq_cst -define void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 @@ -574,7 +574,7 @@ done: ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %sunkaddr2, i32 undef, i32 2 seq_cst monotonic -define void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 @@ -600,7 +600,7 @@ done: ; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 ; OPT: br i1 ; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic -define void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 @@ -627,7 +627,7 @@ done: ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2) -define void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, 
i32 7 @@ -653,7 +653,7 @@ done: ; OPT: %sunkaddr1 = add i32 %sunkaddr, 28 ; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)* ; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2) -define void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { +define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) { entry: %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999 %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 diff --git a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll index a92f691e399..53adf09026e 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll @@ -36,7 +36,7 @@ ; GCN: BB0_3: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { entry: %shr = lshr i32 %arg1, 8 br i1 undef, label %bb0, label %bb1 @@ -76,7 +76,7 @@ ret: ; OPT: ret ; GCN-LABEL: {{^}}sink_sbfe_i32: -define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { +define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 { entry: %shr = ashr i32 %arg1, 8 br i1 undef, label %bb0, label %bb1 @@ -134,7 +134,7 @@ ret: ; GCN: BB2_3: ; GCN: buffer_store_short ; GCN: s_endpgm -define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 { entry: %shr = lshr i16 %arg1, 4 br i1 undef, label %bb0, label %bb1 @@ -187,7 +187,7 @@ ret: ; GCN: BB3_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 30 br i1 undef, label %bb0, label %bb1 @@ -236,7 +236,7 @@ ret: ; GCN: BB4_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 15 br i1 undef, label %bb0, label %bb1 @@ -283,7 +283,7 @@ ret: ; GCN: BB5_3: ; GCN: buffer_store_dwordx2 -define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 { +define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 { entry: %shr = lshr i64 %arg1, 35 br i1 undef, label %bb0, label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll index 9b4b61cf728..208d97feb64 100644 --- a/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll +++ b/llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll @@ -8,7 +8,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32) ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} -define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { +define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) { bb0: %tmp = icmp sgt i32 %arg1, 4 %c = icmp eq i32 %arg3, 0 @@ -35,7 +35,7 @@ bb2: ; GCN-NOT: vcc ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}} -define void 
@preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { +define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) { bb0: %tmp = icmp sgt i32 %arg1, 4 %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef) diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll index 4c7875c3a03..3e1b76a1df0 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float, float) ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: ; CHECK: ; NumVgprs: 8 -define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 br i1 %cmpflag, label %loop, label %exit diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll index 58517209267..155de5353bc 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll @@ -8,7 +8,7 @@ ; SI-LLC-LABEL: {{^}}test: ; SI-LLC: s_mul_i32 ; SI-LLC-NOT: mul -define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { +define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) { entry: %0 = mul nsw i32 %a, 3 %1 = sext i32 %0 to i64 diff --git a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll index 01572afa620..f8d4e01085c 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_vloads.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_vloads.ll @@ -12,7 +12,7 @@ ; EG-LABEL: {{^}}combine_vloads: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind { +define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind { entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index 43a0e58a5ed..973c4544d97 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}commute_eq_64_i32: ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -21,7 +21,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ne_64_i32: ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -36,7 +36,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 ; 
GCN-LABEL: {{^}}commute_ne_litk_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -49,7 +49,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_ugt_64_i32: ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -62,7 +62,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_uge_64_i32: ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}} -define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -75,7 +75,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ult_64_i32: ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -88,7 +88,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_63_i32: ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}} -define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -104,7 +104,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_64_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}} ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}} -define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -117,7 +117,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_sgt_neg1_i32: ; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}} -define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sgt_neg1_i32(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_sge_neg2_i32: ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}} -define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -143,7 +143,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_slt_neg16_i32: ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}} -define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -156,7 +156,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; GCN-LABEL: {{^}}commute_sle_5_i32: ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}} -define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -173,7 +173,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_eq_64_i64: ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -186,7 +186,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ne_64_i64: ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -199,7 +199,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_ugt_64_i64: ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -212,7 +212,7 @@ define void @commute_ugt_64_i64(i32 
addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_uge_64_i64: ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -225,7 +225,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ult_64_i64: ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -238,7 +238,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_63_i64: ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -254,7 +254,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_ule_64_i64: ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}} ; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -267,7 +267,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) # ; GCN-LABEL: {{^}}commute_sgt_neg1_i64: ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -280,7 +280,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_sge_neg2_i64: ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -293,7 +293,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-LABEL: {{^}}commute_slt_neg16_i64: ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}} -define void 
@commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -306,7 +306,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-LABEL: {{^}}commute_sle_5_i64: ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -324,7 +324,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32: ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -338,7 +338,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ogt_2.0_f32: ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -351,7 +351,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_oge_2.0_f32: ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -364,7 +364,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_olt_2.0_f32: ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -377,7 +377,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ole_2.0_f32: ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr 
float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -390,7 +390,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_one_2.0_f32: ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -403,7 +403,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ord_2.0_f32: ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -416,7 +416,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ueq_2.0_f32: ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -429,7 +429,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ugt_2.0_f32: ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -442,7 +442,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_uge_2.0_f32: ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -455,7 +455,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_ult_2.0_f32: ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -468,7 +468,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: 
{{^}}commute_ule_2.0_f32: ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -481,7 +481,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_une_2.0_f32: ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}} -define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -494,7 +494,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_uno_2.0_f32: ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]] -define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -512,7 +512,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in ; GCN-LABEL: {{^}}commute_oeq_2.0_f64: ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -526,7 +526,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ogt_2.0_f64: ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -539,7 +539,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_oge_2.0_f64: ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -552,7 +552,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_olt_2.0_f64: ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define 
amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -565,7 +565,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ole_2.0_f64: ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -578,7 +578,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_one_2.0_f64: ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -591,7 +591,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ord_2.0_f64: ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -604,7 +604,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ueq_2.0_f64: ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -617,7 +617,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ugt_2.0_f64: ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -630,7 +630,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_uge_2.0_f64: ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -643,7 +643,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ult_2.0_f64: ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -656,7 +656,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_ule_2.0_f64: ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -669,7 +669,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_une_2.0_f64: ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}} -define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -682,7 +682,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN-LABEL: {{^}}commute_uno_2.0_f64: ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]] -define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -703,7 +703,7 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} -define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { entry: %stack0 = alloca i32 %ptr0 = load volatile i32*, i32* addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll index ed4ec82eb3e..8820e4fd80e 100644 --- a/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/commute_modifiers.ll @@ -8,7 +8,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0 ; SI: buffer_store_dword [[REG]] -define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* 
%in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -22,7 +22,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0 ; SI: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -37,7 +37,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]] ; SI: buffer_store_dword [[REG]] -define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -53,7 +53,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace( ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]| ; SI: buffer_store_dword [[REG]] -define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %x = load float, float addrspace(1)* %gep.0 @@ -68,7 +68,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace( ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -85,7 +85,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]] ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -102,7 +102,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -121,7 +121,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -139,7 +139,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]| ; SI: buffer_store_dword [[REG]] -define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -161,7 +161,7 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]| ; SI: buffer_store_dword [[RESULT]] -define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll index 2e6be5d10f0..7394842d156 100644 --- a/llvm/test/CodeGen/AMDGPU/concat_vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/concat_vectors.ll @@ -8,7 +8,7 @@ ; value if we want to ensure scratch memory is not being used. 
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1> store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -17,7 +17,7 @@ define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x ; FUNC-LABEL: {{^}}test_concat_v2i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -26,7 +26,7 @@ define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x ; FUNC-LABEL: {{^}}test_concat_v4i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32 ret void @@ -35,7 +35,7 @@ define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x ; FUNC-LABEL: {{^}}test_concat_v8i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64 ret void @@ -44,7 +44,7 @@ define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x ; FUNC-LABEL: {{^}}test_concat_v16i32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind { %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128 ret void @@ -53,7 +53,7 @@ define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <1 ; FUNC-LABEL: {{^}}test_concat_v1f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind { %concat = shufflevector <1 x float> %a, <1 x float> %b, 
<2 x i32> <i32 0, i32 1> store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8 ret void @@ -62,7 +62,7 @@ define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, < ; FUNC-LABEL: {{^}}test_concat_v2f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16 ret void @@ -71,7 +71,7 @@ define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, < ; FUNC-LABEL: {{^}}test_concat_v4f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32 ret void @@ -80,7 +80,7 @@ define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, < ; FUNC-LABEL: {{^}}test_concat_v8f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64 ret void @@ -89,7 +89,7 @@ define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, ; FUNC-LABEL: {{^}}test_concat_v16f32: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128 ret void @@ -98,7 +98,7 @@ define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a ; FUNC-LABEL: {{^}}test_concat_v1i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1> store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 ret void @@ -107,7 +107,7 @@ define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, ; FUNC-LABEL: 
{{^}}test_concat_v2i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 ret void @@ -116,7 +116,7 @@ define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v4i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 ret void @@ -125,7 +125,7 @@ define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v8i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 ret void @@ -134,7 +134,7 @@ define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a ; FUNC-LABEL: {{^}}test_concat_v16i64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 ret void @@ -143,7 +143,7 @@ define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> ; FUNC-LABEL: {{^}}test_concat_v1f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind { %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1> store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16 ret void @@ -152,7 +152,7 @@ define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v2f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> 
%b) nounwind { +define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32 ret void @@ -161,7 +161,7 @@ define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v4f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64 ret void @@ -170,7 +170,7 @@ define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, ; FUNC-LABEL: {{^}}test_concat_v8f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128 ret void @@ -179,7 +179,7 @@ define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a ; FUNC-LABEL: {{^}}test_concat_v16f64: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256 ret void @@ -188,7 +188,7 @@ define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> ; FUNC-LABEL: {{^}}test_concat_v1i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind { %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1> store <2 x i1> %concat, <2 x i1> addrspace(1)* %out ret void @@ -197,7 +197,7 @@ define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> ; FUNC-LABEL: {{^}}test_concat_v2i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind { %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> store <4 x i1> %concat, <4 x 
i1> addrspace(1)* %out ret void @@ -206,7 +206,7 @@ define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> ; FUNC-LABEL: {{^}}test_concat_v4i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind { %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> store <8 x i1> %concat, <8 x i1> addrspace(1)* %out ret void @@ -215,7 +215,7 @@ define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> ; FUNC-LABEL: {{^}}test_concat_v8i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind { %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> store <16 x i1> %concat, <16 x i1> addrspace(1)* %out ret void @@ -224,7 +224,7 @@ define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1 ; FUNC-LABEL: {{^}}test_concat_v16i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind { %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> store <32 x i1> %concat, <32 x i1> addrspace(1)* %out ret void @@ -233,7 +233,7 @@ define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x ; FUNC-LABEL: {{^}}test_concat_v32i1: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { +define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind { %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> store <64 x i1> %concat, <64 x i1> addrspace(1)* %out ret void @@ -242,7 +242,7 @@ define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x ; FUNC-LABEL: {{^}}test_concat_v1i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind { %concat = shufflevector <1 x i16> %a, <1 x i16> 
%b, <2 x i32> <i32 0, i32 1> store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4 ret void @@ -251,7 +251,7 @@ define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x ; FUNC-LABEL: {{^}}test_concat_v2i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind { %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3> store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8 ret void @@ -260,7 +260,7 @@ define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x ; FUNC-LABEL: {{^}}test_concat_v4i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16 ret void @@ -269,7 +269,7 @@ define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x ; FUNC-LABEL: {{^}}test_concat_v8i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32 ret void @@ -278,7 +278,7 @@ define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x ; FUNC-LABEL: {{^}}test_concat_v16i16: ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000 ; SI-NOT: movrel -define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { +define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind { %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64 ret void @@ -286,7 +286,7 @@ define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <1 ; FUNC-LABEL: {{^}}concat_vector_crash: ; SI: s_endpgm -define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { +define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) { bb: %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4 %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 34bb2588ad6..62b47beb125 100644 --- 
a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -1,12 +1,12 @@ # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s --- | - define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %and = and i32 %a, 1234567 store volatile i32 %and, i32 addrspace(1)* %out ret void } - define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -17,13 +17,13 @@ ret void } - define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl = shl i32 %a, 12 store volatile i32 %shl, i32 addrspace(1)* %out ret void } - define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -34,13 +34,13 @@ ret void } - define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %ashr = ashr i32 %a, 12 store volatile i32 %ashr, i32 addrspace(1)* %out ret void } - define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom @@ -51,13 +51,13 @@ ret void } - define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { + define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %lshr = lshr i32 %a, 12 store volatile i32 %lshr, i32 addrspace(1)* %out ret void } - define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { + define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 0ff75ab5800..0831d250b9e 100644 --- a/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -5,7 +5,7 @@ ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_and_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %and = and i32 %size, %x @@ -17,7 +17,7 @@ define void 
@fold_mi_v_and_0(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %and = and i32 %size, %x store i32 %and, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_or_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %or = or i32 %size, %x @@ -42,7 +42,7 @@ define void @fold_mi_v_or_0(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %or = or i32 %size, %x store i32 %or, i32 addrspace(1)* %out @@ -53,7 +53,7 @@ define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, %x @@ -67,7 +67,7 @@ define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, %x store i32 %xor, i32 addrspace(1)* %out @@ -78,7 +78,7 @@ define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}} ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { +define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %xor = xor i32 %size, -1 store i32 %xor, i32 addrspace(1)* %out @@ -91,7 +91,7 @@ define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @fold_mi_v_not_0(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { %vreg = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg) %xor = xor i64 %ctpop, -1 @@ -110,7 +110,7 @@ define void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @fold_mi_or_neg1(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 
addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) @@ -126,7 +126,7 @@ define void @fold_mi_or_neg1(i64 addrspace(1)* %out) { ; GCN: v_not_b32 ; GCN: v_and_b32 ; GCN-NOT: v_and_b32 -define void @fold_mi_and_neg1(i64 addrspace(1)* %out) { +define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) { %vreg0 = load volatile i64, i64 addrspace(1)* undef %vreg1 = load volatile i64, i64 addrspace(1)* undef %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0) diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index 013e89fcdb5..d3e6c11ef90 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -72,7 +72,7 @@ ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] -define void @divergent_if_endif(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef @@ -150,7 +150,7 @@ endif: ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] -define void @divergent_loop(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef @@ -272,7 +272,7 @@ end: ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] -define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %load0 = load volatile i32, i32 addrspace(3)* undef diff --git a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll index c3d155ce747..0074a41e44c 100644 --- a/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ b/llvm/test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN: v_cmp_ne_u32_e64 ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: -define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1 @@ -30,7 +30,7 @@ bb5: ; preds = %bb3, %bb ; GCN: BB{{[0-9]+_[0-9]+}}: -define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) diff --git a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll index 7434d745b25..026dd7ca6c8 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-illegal-type.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN: buffer_load_dword [[REG:v[0-9]+]] ; GCN: buffer_store_dword [[REG]] ; GCN: 
s_endpgm -define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -32,7 +32,7 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -47,7 +47,7 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( ; GCN: buffer_store_dword [[REG]] ; GCN: buffer_store_dword [[REG]] ; GCN: s_endpgm -define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4 @@ -65,7 +65,7 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace( ; GCN-DAG: buffer_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9> store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4 @@ -85,7 +85,7 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add ; GCN: {{buffer|flat}}_store_dword ; GCN: {{buffer|flat}}_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* 
%in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4 @@ -101,7 +101,7 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm -define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -113,7 +113,7 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}} ; GCN: s_endpgm -define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2 ret void @@ -128,7 +128,7 @@ define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind { %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1 ret void @@ -141,7 +141,7 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void @@ -157,7 +157,7 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll index 3422a889a52..f35b0706f3d 100644 --- a/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/copy-to-reg.ll @@ -6,7 +6,7 @@ ; Make sure this doesn't crash ; CHECK-LABEL: {{^}}copy_to_reg_frameindex: -define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { 
entry: %alloca = alloca [16 x i32] br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index 1a0027dd4a3..709c647baa5 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -27,7 +27,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; EG: FFBH_UINT ; EG: CNDE_INT -define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -43,7 +43,7 @@ define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; EG: FFBH_UINT ; EG: CNDE_INT -define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -61,7 +61,7 @@ define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia ; EG: CNDE_INT ; EG: FFBH_UINT ; EG: CNDE_INT -define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 @@ -89,7 +89,7 @@ define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp ; EG-DAG: FFBH_UINT ; EG-DAG: CNDE_INT -define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 @@ -101,7 +101,7 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp ; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], ; GCN: s_endpgm -define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -119,14 +119,14 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias % ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) store i64 %ctlz, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_ctlz_i64_trunc: 
-define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -145,7 +145,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]] ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}} -define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -156,7 +156,7 @@ define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia } ; FUNC-LABEL: {{^}}v_ctlz_i64_trunc: -define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -172,7 +172,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm - define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -186,7 +186,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -202,7 +202,7 @@ define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspac ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm -define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp eq i32 %ctlz, 32 @@ -217,7 +217,7 @@ define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; GCN: v_cmp ; GCN: v_cndmask ; GCN: s_endpgm -define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void 
@v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone %cmp = icmp ne i32 %ctlz, 32 @@ -230,7 +230,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: {{buffer|flat}}_store_byte [[FFBH]], - define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -245,7 +245,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; SI: buffer_load_ushort [[VAL:v[0-9]+]], ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; SI: buffer_store_short [[FFBH]], - define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind { %val = load i16, i16 addrspace(1)* %valptr %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone %cmp = icmp eq i16 %val, 0 @@ -260,7 +260,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]] ; GCN: {{buffer|flat}}_store_byte [[TRUNC]], -define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid %val = load i7, i7 addrspace(1)* %valptr.gep diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index d390f64deea..87ba563a740 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -22,7 +22,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou ; GCN: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBH_UINT {{\*? 
*}}[[RESULT]] -define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone store i32 %ctlz, i32 addrspace(1)* %out, align 4 @@ -51,7 +51,7 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8 @@ -71,7 +71,7 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] ; EG: FFBH_UINT {{\*? *}}[[RESULT]] -define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16 @@ -82,7 +82,7 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_byte [[RESULT]], -define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %val = load i8, i8 addrspace(1)* %valptr %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone store i8 %ctlz, i8 addrspace(1)* %out @@ -100,14 +100,14 @@ define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1) ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) store i64 %ctlz, i64 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc: -define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true) %trunc = trunc i64 %ctlz to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -123,7 +123,7 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], 
[[FFBH_HI]], [[FFBH_LO]] ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}} ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}} -define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -134,7 +134,7 @@ define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace } ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc: -define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], - define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], -define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -175,7 +175,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]], ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]] ; GCN: {{buffer|flat}}_store_byte [[FFBH]], -define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid %val = load i8, i8 addrspace(1)* %valptr.gep @@ -194,7 +194,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 ; GCN-DAG: buffer_store_dword [[RESULT0]] ; GCN-DAG: buffer_store_byte [[RESULT1]] ; GCN: s_endpgm - define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 
addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -211,7 +211,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 0 @@ -227,7 +227,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword -define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 0 @@ -243,7 +243,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword - define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { + define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp eq i32 %val, 1 @@ -259,7 +259,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 ; GCN: v_cmp ; GCN: v_cndmask ; GCN: buffer_store_dword -define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone %cmp = icmp ne i32 %val, 1 diff --git a/llvm/test/CodeGen/AMDGPU/ctpop.ll b/llvm/test/CodeGen/AMDGPU/ctpop.ll index 9692236bb36..a29e72ea57c 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop.ll @@ -16,7 +16,7 @@ declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone ; GCN: s_endpgm ; EG: BCNT_INT -define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 ret void @@ -30,7 +30,7 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone store i32 %ctpop, i32 addrspace(1)* %out, align 4 @@ -48,7 +48,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias 
%out, i32 addrspace(1)* noali ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { +define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 %val1 = load i32, i32 addrspace(1)* %in1, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone @@ -64,7 +64,7 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { +define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { %val0 = load i32, i32 addrspace(1)* %in0, align 4 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone %add = add i32 %ctpop0, %sval @@ -79,7 +79,7 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace( ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -97,7 +97,7 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -123,7 +123,7 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind { %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32 @@ -165,7 +165,7 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs ; EG: BCNT_INT ; EG: BCNT_INT ; EG: BCNT_INT -define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind { %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 
32 @@ -179,7 +179,7 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 4 @@ -194,7 +194,7 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 4, %ctpop @@ -209,7 +209,7 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, 99999 @@ -225,7 +225,7 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %ctpop, %const @@ -241,7 +241,7 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1 ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %add = add i32 %const, %ctpop @@ -258,7 +258,7 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa ; GCN: s_endpgm ; EG: BCNT_INT -define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { +define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4 @@ -279,7 +279,7 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm ; EG: BCNT_INT -define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 
addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll index cd5d805e5db..2610684ad9e 100644 --- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll @@ -17,7 +17,7 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], ; GCN: s_endpgm -define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -31,7 +31,7 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind { ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %truncctpop = trunc i64 %ctpop to i32 @@ -48,7 +48,7 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GCN: s_endpgm -define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { +define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind { %val = load i64, i64 addrspace(1)* %in, align 8 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone %or = or i64 %ctpop, %s.val @@ -60,7 +60,7 @@ define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 ; GCN: s_endpgm -define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { +define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind { %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8 @@ -73,7 +73,7 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) ; GCN: s_bcnt1_i32_b64 ; GCN: s_bcnt1_i32_b64 ; GCN: s_endpgm -define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { +define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind { %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16 @@ -86,7 +86,7 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) ; GCN: v_bcnt_u32_b32 ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm -define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind 
{ +define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone %truncctpop = trunc <2 x i64> %ctpop to <2 x i32> @@ -104,7 +104,7 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs ; GCN: v_bcnt_u32_b32 ; GCN: v_bcnt_u32_b32 ; GCN: s_endpgm -define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind { %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone %truncctpop = trunc <4 x i64> %ctpop to <4 x i32> @@ -121,7 +121,7 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]] ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}} ; GCN: s_endpgm -define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { +define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) { entry: %tmp0 = icmp eq i32 %cond, 0 br i1 %tmp0, label %if, label %else @@ -146,7 +146,7 @@ endif: ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]], ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]] ; GCN: s_endpgm -define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -159,7 +159,7 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind { ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]], ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]] ; GCN: s_endpgm -define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { +define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone %truncctpop = trunc i65 %ctpop to i32 store i32 %truncctpop, i32 addrspace(1)* %out, align 4 @@ -181,7 +181,7 @@ define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind { ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind { %val = load i128, i128 addrspace(1)* %in, align 8 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone %truncctpop = trunc i128 %ctpop to i32 diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll index e33cc18eb05..1fa6407647e 100644 --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -14,7 +14,7 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { +define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind { %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou ; SI: s_endpgm ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind { %val = load i32, i32 addrspace(1)* %valptr, align 4 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone store i32 %cttz, i32 addrspace(1)* %out, align 4 @@ -43,7 +43,7 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] -define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8 @@ -63,7 +63,7 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? *}}[[RESULT]] ; EG: FFBL_INT {{\*? 
*}}[[RESULT]] -define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { +define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind { %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/cube.ll b/llvm/test/CodeGen/AMDGPU/cube.ll index 3c126a42fdd..7b5f1aff7ea 100644 --- a/llvm/test/CodeGen/AMDGPU/cube.ll +++ b/llvm/test/CodeGen/AMDGPU/cube.ll @@ -12,7 +12,7 @@ declare float @llvm.amdgcn.cubema(float, float, float) #0 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: _store_dwordx4 -define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 { %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c) %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c) diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll index 7baaa81fba5..e16daa6fad9 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; GCN-NOT: lshr ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]] ; GCN: buffer_store_dword [[CONV]], -define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %cvt = uitofp i8 %load to float store float %cvt, float addrspace(1)* %out, align 4 @@ -22,7 +22,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2 %cvt = uitofp <2 x i8> %load to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16 @@ -36,7 +36,7 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]] ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}}, -define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind { %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4 %cvt = uitofp <3 x i8> %load to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16 @@ -52,7 +52,7 @@ 
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -76,7 +76,7 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
; GCN: buffer_store_dwordx4
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -110,7 +110,7 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -124,7 +124,7 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
; Make sure this doesn't crash.
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
-define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <7 x i8> %load to <7 x float>
store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
@@ -147,7 +147,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
; GCN-NOT: lshr
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
-define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>
store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
@@ -159,7 +159,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
-define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%add = add i32 %load, 2
%inreg = and i32 %add, 255
@@ -169,7 +169,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
}
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
-define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%inreg = and i32 %load, 65280
%shr = lshr i32 %inreg, 8
@@ -181,7 +181,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
; GCN-LABEL: {{^}}i8_zext_i32_to_f32: -define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind { %load = load i8, i8 addrspace(1)* %in, align 1 %ext = zext i8 %load to i32 %cvt = uitofp i32 %ext to float @@ -190,7 +190,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1 } ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32: -define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1 %ext = zext <4 x i8> %load to <4 x i32> %cvt = uitofp <4 x i32> %ext to <4 x float> @@ -203,7 +203,7 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %and = and i32 %val, 255 %cvt = uitofp i32 %and to float @@ -216,7 +216,7 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 8 %and = and i32 %srl, 255 @@ -230,7 +230,7 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 16 %and = and i32 %srl, 255 @@ -244,7 +244,7 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac ; GCN-NOT: [[VAL]] ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[CONV]] -define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { %val = load i32, i32 addrspace(1)* %in %srl = lshr i32 %val, 24 %and = and i32 %srl, 255 diff --git a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll index e7773c6e2a4..c10cf1a8a6f 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll @@ -10,7 +10,7 @@ declare float @llvm.floor.f32(float) #1 ; SI-NOT: add ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel 
void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %floor = call float @llvm.floor.f32(float %x) #1 %cvt = fptosi float %floor to i32 store i32 %cvt, i32 addrspace(1)* %out @@ -22,7 +22,7 @@ define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 1.0 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptosi float %floor to i32 @@ -35,7 +35,7 @@ define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}| ; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %floor = call float @llvm.floor.f32(float %x.fabs) #1 %cvt = fptosi float %floor to i32 @@ -48,7 +48,7 @@ define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fneg = fsub float -0.000000e+00, %x %floor = call float @llvm.floor.f32(float %x.fneg) #1 %cvt = fptosi float %floor to i32 @@ -61,7 +61,7 @@ define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}| ; SI: s_endpgm -define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1 @@ -75,7 +75,7 @@ define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI: v_floor_f32 ; SI: v_cvt_u32_f32_e32 ; SI: s_endpgm -define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %floor = call float @llvm.floor.f32(float %x) #1 %cvt = fptoui float %floor to i32 store i32 %cvt, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll index d38411dcca6..9b771ebdf7b 100644 --- a/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll @@ -9,7 +9,7 @@ declare float @llvm.floor.f32(float) #1 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}} ; SI: s_endpgm -define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptosi float %floor to i32 @@ -21,7 +21,7 @@ define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_rpi_i32_f32 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { +define 
amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %fadd = fadd float %x.fabs, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 @@ -37,7 +37,7 @@ define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fneg = fsub float -0.000000e+00, %x %fadd = fadd float %x.fneg, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 @@ -55,7 +55,7 @@ define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI-SAFE-NOT: v_cvt_flr_i32_f32 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]] ; SI: s_endpgm -define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { %x.fabs = call float @llvm.fabs.f32(float %x) #1 %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs %fadd = fadd float %x.fabs.fneg, 0.5 @@ -71,7 +71,7 @@ define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 { ; SI: v_floor_f32 ; SI: v_cvt_u32_f32 ; SI: s_endpgm -define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 { %fadd = fadd float %x, 0.5 %floor = call float @llvm.floor.f32(float %fadd) #1 %cvt = fptoui float %floor to i32 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll index a32c16dfac3..11acbc274eb 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll @@ -9,7 +9,7 @@ ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]] ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]] -define void @store_same_base_ptr(i32 addrspace(1)* %out) { +define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #0 %offset = sext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll index fb43ff4fbdd..ceff889b3a7 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll @@ -10,7 +10,7 @@ ; CHECK: {{^}}sint: ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %sint = load i32, i32 addrspace(1) * %in @@ -24,7 +24,7 @@ entry: ;CHECK: {{^}}uint: ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %uint = load i32, i32 addrspace(1) * %in diff --git a/llvm/test/CodeGen/AMDGPU/debug.ll b/llvm/test/CodeGen/AMDGPU/debug.ll index a2e0e878b74..f149aaddb8e 100644 --- a/llvm/test/CodeGen/AMDGPU/debug.ll +++ b/llvm/test/CodeGen/AMDGPU/debug.ll @@ -4,7 +4,7 @@ ; Test 
for a crash in the custom assembly dump code. ; SI: s_endpgm -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { store i32 0, i32 addrspace(1)* %out ret void } diff --git a/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll index 49a7e722f29..734905ba2b0 100644 --- a/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/debugger-emit-prologue.ll @@ -23,7 +23,7 @@ ; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll index 7be7d9486a4..fcdbfb10a8c 100644 --- a/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll +++ b/llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll @@ -22,7 +22,7 @@ ; CHECK-NEXT: s_endpgm ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll index d63ba149ba6..764c60b12bf 100644 --- a/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll +++ b/llvm/test/CodeGen/AMDGPU/debugger-reserve-regs.ll @@ -6,7 +6,7 @@ ; CHECK-NEXT: ReservedVGPRCount: 4 ; Function Attrs: nounwind -define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll index bfb692f318c..ad9111a2865 100644 --- a/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}test_default_si: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { +define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -12,7 +12,7 @@ define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN-LABEL: {{^}}test_default_vi: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { +define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -21,7 +21,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN-LABEL: {{^}}test_f64_denormals: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { +define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -30,7 +30,7 @@ define 
void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN-LABEL: {{^}}test_f32_denormals: ; GCNL: FloatMode: 48 ; GCN: IeeeMode: 1 -define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { +define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -39,7 +39,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN-LABEL: {{^}}test_f32_f64_denormals: ; GCN: FloatMode: 240 ; GCN: IeeeMode: 1 -define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { +define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -48,7 +48,7 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace( ; GCN-LABEL: {{^}}test_no_denormals ; GCN: FloatMode: 0 ; GCN: IeeeMode: 1 -define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { +define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -57,7 +57,7 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* % ; GCN-LABEL: {{^}}test_f16_f64_denormals: ; GCN: FloatMode: 192 ; GCN: IeeeMode: 1 -define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 { +define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 { store half 0.0, half addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -66,7 +66,7 @@ define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1 ; GCN-LABEL: {{^}}test_no_f16_f64_denormals: ; GCN: FloatMode: 0 ; GCN: IeeeMode: 1 -define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 { +define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 { store half 0.0, half addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -75,7 +75,7 @@ define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspac ; GCN-LABEL: {{^}}test_f32_f16_f64_denormals: ; GCN: FloatMode: 240 ; GCN: IeeeMode: 1 -define void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 { +define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 { store half 0.0, half addrspace(1)* %out0 store float 0.0, float addrspace(1)* %out1 store double 0.0, double addrspace(1)* %out2 diff --git a/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir index 9d70f67ef49..32e6f7cc0cd 100644 --- a/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir +++ b/llvm/test/CodeGen/AMDGPU/detect-dead-lanes.mir @@ -1,14 +1,14 @@ # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s --- | - define void @test0() { ret void } - define void @test1() { ret void } - define void @test2() { ret void } - define void @test3() { ret void } - define void @test4() { ret void } - define void @test5() { ret void } - define void @loop0() { 
ret void } - define void @loop1() { ret void } - define void @loop2() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } + define amdgpu_kernel void @test2() { ret void } + define amdgpu_kernel void @test3() { ret void } + define amdgpu_kernel void @test4() { ret void } + define amdgpu_kernel void @test5() { ret void } + define amdgpu_kernel void @loop0() { ret void } + define amdgpu_kernel void @loop1() { ret void } + define amdgpu_kernel void @loop2() { ret void } ... --- # Combined use/def transfer check, the basics. diff --git a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll index cdd2c0cd4f4..6dfe1294bb4 100644 --- a/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll @@ -9,7 +9,7 @@ ; CHECK: ALU_PUSH_BEFORE ; CHECK-NEXT: JUMP ; CHECK-NEXT: LOOP_BREAK -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { +define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind { entry: %cmp5 = icmp sgt i32 %iterations, 0 br i1 %cmp5, label %for.body, label %for.end diff --git a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll index 5e1ebfde3e1..878b5ebe940 100644 --- a/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -9,7 +9,7 @@ ; GCN: buffer_load_dword ; GCN: ds_write2_b32 ; GCN: s_endpgm -define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { +define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 { entry: %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx = shl i32 %tid, 2 diff --git a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll index f461d6978f1..5997e27fd81 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -23,7 +23,7 @@ declare void @llvm.amdgcn.s.barrier() #1 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256 ; CHECK: s_endpgm -define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { +define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 { entry: %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %mul = shl nsw i32 %x.i, 1 diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll index 16fb019ae0f..d74bd5aa15a 100644 --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]] ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12 -define void @write_ds_sub0_offset0_global() #0 { +define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 { 
entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %sub1 = sub i32 0, %x.i @@ -24,7 +24,7 @@ entry: ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535 -define void @add_x_shl_neg_to_sub_max_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -39,7 +39,7 @@ define void @add_x_shl_neg_to_sub_max_offset() #1 { ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}} -define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -58,7 +58,7 @@ define void @add_x_shl_neg_to_sub_max_offset_p1() #1 { ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}} ; GCN: s_endpgm -define void @add_x_shl_neg_to_sub_multi_use() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -80,7 +80,7 @@ define void @add_x_shl_neg_to_sub_multi_use() #1 { ; GCN-NOT: v_sub ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}} ; GCN: s_endpgm -define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -95,7 +95,7 @@ define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 { ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255 -define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 @@ -109,7 +109,7 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 { ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]] ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}} -define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { +define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 { %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0 %neg = sub i32 0, %x.i %shl = shl i32 %neg, 2 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll index 9a313230e30..2c474dbe7b0 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -12,7 +12,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -31,7 +31,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 { ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], 
v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -49,7 +49,7 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 ; SI: s_endpgm -define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -66,7 +66,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 { ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -98,7 +98,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 { ; SI: s_barrier ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 0 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -133,7 +133,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 { ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0 @@ -170,7 +170,7 @@ define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 ; SI: ds_read_b32 ; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 @@ -196,7 +196,7 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad ; SI: ds_read_b32 ; SI: ds_read_b32 ; SI: s_endpgm -define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float 
addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0 %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0 @@ -219,7 +219,7 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32: ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}} ; SI: s_endpgm -define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0 %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1 @@ -243,7 +243,7 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4 @@ -261,7 +261,7 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 { ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -280,7 +280,7 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 { ; SI-LABEL: @unaligned_read2_f32 ; SI-NOT: ds_read2_b32 ; SI: s_endpgm -define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 1 @@ -296,7 +296,7 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* % ; SI-LABEL: @misaligned_2_simple_read2_f32 ; SI-NOT: ds_read2_b32 ; SI: s_endpgm -define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 2 @@ -315,7 +315,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void 
@simple_read2_f64(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -331,7 +331,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 { ; SI-LABEL: @simple_read2_f64_max_offset ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255 ; SI: s_endpgm -define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -349,7 +349,7 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 { ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056 ; SI: s_endpgm -define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -367,7 +367,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 { ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15 ; SI: s_endpgm -define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 4 @@ -385,7 +385,7 @@ define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3) ; SI-LABEL: @load_constant_adjacent_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 -define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 %sum = add i32 %val0, %val1 @@ -396,7 +396,7 @@ define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) { ; SI-LABEL: @load_constant_disjoint_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2 -define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 %sum = add i32 %val0, %val1 @@ -410,7 
+410,7 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3 -define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { +define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 %sum = add i64 %val0, %val1 @@ -426,7 +426,7 @@ define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) { ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1 ; SI: s_endpgm -define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { +define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 %sum = add i64 %val0, %val1 @@ -437,7 +437,7 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) { @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 -define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { +define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i @@ -481,13 +481,13 @@ define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i ret void } -define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 { %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4 store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8 ret void } -define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 { %load = load i64, i64 addrspace(3)* %in, align 4 store i64 %load, i64 addrspace(1)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll index 4a3f3fb9970..9668743cf12 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll @@ -10,7 +10,7 @@ ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12 -define void @offset_order(float addrspace(1)* %out) { +define amdgpu_kernel void @offset_order(float addrspace(1)* %out) { entry: %ptr0 = getelementptr inbounds [512 x float], [512 x 
float] addrspace(3)* @lds, i32 0, i32 0 %val0 = load float, float addrspace(3)* %ptr0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll index 9d8375d6403..fc85ec06f58 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -12,7 +12,7 @@ ; CI: s_waitcnt lgkmcnt(0) ; CI: buffer_store_dwordx2 [[RESULT]] ; CI: s_endpgm -define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4 @@ -26,7 +26,7 @@ define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) ; CI: s_waitcnt lgkmcnt(0) ; CI: buffer_store_dwordx2 [[RESULT]] ; CI: s_endpgm -define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0 @@ -43,7 +43,7 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 { ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD2]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4 @@ -68,7 +68,7 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 { ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]] ; CI: buffer_store_dword v[[ADD1]] ; CI: s_endpgm -define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4 @@ -88,7 +88,7 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 { ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} ; CI: buffer_store_dwordx4 [[REG_ZW]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8 @@ -101,7 +101,7 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}} ; CI: buffer_store_dwordx4 
[[REG_ZW]] ; CI: s_endpgm -define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0 @@ -117,7 +117,7 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 { ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}} ; CI: s_endpgm -define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0 @@ -138,7 +138,7 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 { ; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48 ; CI: s_endpgm -define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0 @@ -153,7 +153,7 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 { ; CI-NOT: v_mov ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}} ; CI: s_endpgm -define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 @@ -176,7 +176,7 @@ define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspa ; CI-NOT: v_mov ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}} ; CI: s_endpgm -define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll index 149b4ce8312..81b35a46aa1 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -10,7 +10,7 @@ ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm 
-define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -29,7 +29,7 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -49,7 +49,7 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace( ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -69,7 +69,7 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}} ; SI: s_endpgm -define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0 @@ -86,7 +86,7 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa ; SI-LABEL: @odd_invalid_read2st64_f32_0 ; SI-NOT: ds_read2st64_b32 ; SI: s_endpgm -define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i %val0 = load float, float addrspace(3)* %arrayidx0, align 4 @@ -102,7 +102,7 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 { ; SI-LABEL: @odd_invalid_read2st64_f32_1 ; SI-NOT: ds_read2st64_b32 ; SI: s_endpgm -define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0 @@ -122,7 +122,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 { ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_0_1(double 
addrspace(1)* %out) #0 { +define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 @@ -141,7 +141,7 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -161,7 +161,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129 ; SI: s_endpgm -define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 4 @@ -181,7 +181,7 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 256 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -201,7 +201,7 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a ; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}} ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]] ; SI: s_endpgm -define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -218,7 +218,7 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou ; SI-LABEL: @invalid_read2st64_f64_odd_offset ; SI-NOT: ds_read2st64_b64 ; SI: s_endpgm -define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0 @@ -239,7 +239,7 @@ define void 
@invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double ; SI-NOT: ds_read2st_b64 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8 ; SI: s_endpgm -define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i %val0 = load double, double addrspace(3)* %arrayidx0, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll index ae230dac937..ab1cf0ba25b 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -9,7 +9,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; SI: s_endpgm -define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i %val = load float, float addrspace(1)* %in.gep, align 4 @@ -27,7 +27,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -46,7 +46,7 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -65,7 +65,7 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32 ; SI: s_endpgm -define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -86,7 +86,7 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: 
s_endpgm -define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1 @@ -107,7 +107,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8 @@ -126,7 +126,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16 @@ -146,7 +146,7 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm -define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -164,7 +164,7 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028 ; SI: s_endpgm -define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -182,7 +182,7 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel 
void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x @@ -212,7 +212,7 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27 ; SI: s_endpgm -define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { +define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x @@ -243,7 +243,7 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f ; SI: ds_write_b32 ; SI: ds_write_b32 ; SI: s_endpgm -define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { +define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i @@ -270,7 +270,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8 ; SI: s_endpgm -define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 @@ -288,7 +288,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15 ; SI: s_endpgm -define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 @@ -306,7 +306,7 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 ; SI: s_endpgm -define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 { %x.i = tail call 
i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 @@ -325,7 +325,7 @@ define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace ; SI-LABEL: @store_constant_adjacent_offsets ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 -define void @store_constant_adjacent_offsets() { +define amdgpu_kernel void @store_constant_adjacent_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4 ret void @@ -335,7 +335,7 @@ define void @store_constant_adjacent_offsets() { ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}} ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2 -define void @store_constant_disjoint_offsets() { +define amdgpu_kernel void @store_constant_disjoint_offsets() { store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4 store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4 ret void @@ -348,7 +348,7 @@ define void @store_constant_disjoint_offsets() { ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; SI: s_endpgm -define void @store_misaligned64_constant_offsets() { +define amdgpu_kernel void @store_misaligned64_constant_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4 ret void @@ -362,7 +362,7 @@ define void @store_misaligned64_constant_offsets() { ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; SI: s_endpgm -define void @store_misaligned64_constant_large_offsets() { +define amdgpu_kernel void @store_misaligned64_constant_large_offsets() { store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4 store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4 ret void @@ -371,7 +371,7 @@ define void @store_misaligned64_constant_large_offsets() { @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4 -define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1 %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1 %val = load float, float addrspace(1)* %in @@ -410,7 +410,7 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}} ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}} ; CI: s_endpgm -define void 
@simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll index 872e7736140..a395af34b67 100644 --- a/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1 ; SI: s_endpgm -define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i %val = load float, float addrspace(1)* %in.gep, align 4 @@ -25,7 +25,7 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5 ; SI: s_endpgm -define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -46,7 +46,7 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 ; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1 @@ -66,7 +66,7 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127 ; SI: s_endpgm -define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1 @@ -85,7 +85,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d ; SI-NOT: ds_write2st64_b64 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8 ; SI: s_endpgm -define void 
@byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { +define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i %val = load double, double addrspace(1)* %in.gep, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll index 580dc00f935..b1107ea7fbc 100644 --- a/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/dynamic_stackalloc.ll @@ -4,7 +4,7 @@ ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca -define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { +define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) { %alloca = alloca i32, i32 %n store volatile i32 0, i32* %alloca ret void diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll index d1624f86765..ace01593808 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert-cost.ll @@ -10,7 +10,7 @@ ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { entry: %v = load double, double addrspace(1)* %in %cc = fcmp oeq double %v, 1.000000e+00 @@ -32,7 +32,7 @@ endif: ; GCN: v_add_f64 ; GCN: v_cndmask_b32_e32 ; GCN: v_cndmask_b32_e32 -define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { entry: %v = load double, double addrspace(2)* %in %cc = fcmp oeq double %v, 1.000000e+00 @@ -62,7 +62,7 @@ endif: ; GCN-DAG: buffer_store_dword v ; GCN-DAG: buffer_store_dwordx2 -define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 @@ -93,7 +93,7 @@ endif: ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc ; GCN: buffer_store_dwordx4 -define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <4 x i32>, <4 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll index 5ae1db8c686..9439130deb9 100644 --- a/llvm/test/CodeGen/AMDGPU/early-if-convert.ll +++ b/llvm/test/CodeGen/AMDGPU/early-if-convert.ll @@ -9,7 +9,7 @@ ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc ; 
GCN: buffer_store_dword [[RESULT]] -define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -32,7 +32,7 @@ endif: ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]] ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc ; GCN: buffer_store_dword [[RESULT]] -define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -58,7 +58,7 @@ endif: ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc ; GCN: s_mov_b64 vcc, [[CMP]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc -define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 { entry: %v = load i32, i32 addrspace(1)* %in %cc = fcmp oeq float %k, 1.000000e+00 @@ -87,7 +87,7 @@ endif: ; GCN: v_mul_f32 ; GCN: v_mul_f32 ; GCN: v_cndmask_b32_e32 -define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -128,7 +128,7 @@ endif: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -162,7 +162,7 @@ endif: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %v = load float, float addrspace(1)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -187,7 +187,7 @@ endif: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 { entry: %v = load i32, i32 addrspace(2)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 @@ -206,7 +206,7 @@ endif: ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load: ; GCN: v_cndmask_b32 -define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 { entry: %v = load float, float addrspace(2)* %in %cc = fcmp oeq float %v, 1.000000e+00 @@ -227,7 +227,7 @@ endif: ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload: ; GCN: v_cndmask_b32 -define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, 
float %v) #0 { entry: %cc = fcmp oeq float %v, 1.000000e+00 br i1 %cc, label %if, label %endif @@ -248,7 +248,7 @@ endif: ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]] ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]] -define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 { entry: %v = load i32, i32 addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -274,7 +274,7 @@ endif: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 { entry: %v = load float, float addrspace(1)* %in %cc = icmp eq i32 %cond, 1 @@ -295,7 +295,7 @@ endif: ; GCN: s_addc_u32 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 { entry: %v = load i64, i64 addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -320,7 +320,7 @@ endif: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 { entry: %v = load <3 x i32>, <3 x i32> addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -345,7 +345,7 @@ endif: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} -define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { +define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 { entry: %v = load <4 x i32>, <4 x i32> addrspace(2)* %in %cc = icmp eq i32 %cond, 1 @@ -364,7 +364,7 @@ endif: ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select: ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}} -define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %else, label %if @@ -385,7 +385,7 @@ done: ; GCN: {{^}}; BB#0: ; GCN-NEXT: s_load_dwordx2 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0 -define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) { entry: br i1 undef, label %else, label %if @@ -410,7 +410,7 @@ done: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <8 x i32>, <8 x i32> addrspace(1)* %in %cc = 
fcmp oeq float %cnd, 1.000000e+00 @@ -435,7 +435,7 @@ endif: ; GCN: [[ENDIF]]: ; GCN: buffer_store_dword -define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 { +define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 { entry: %v = load <16 x i32>, <16 x i32> addrspace(1)* %in %cc = fcmp oeq float %cnd, 1.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/elf.r600.ll b/llvm/test/CodeGen/AMDGPU/elf.r600.ll index 51cd0850093..93c5e557503 100644 --- a/llvm/test/CodeGen/AMDGPU/elf.r600.ll +++ b/llvm/test/CodeGen/AMDGPU/elf.r600.ll @@ -9,7 +9,7 @@ ; CONFIG-NEXT: .long 2 ; CONFIG-NEXT: .long 165900 ; CONFIG-NEXT: .long 0 -define void @test(float addrspace(1)* %out, i32 %p) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) { %i = add i32 %p, 2 %r = bitcast i32 %i to float store float %r, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/empty-function.ll b/llvm/test/CodeGen/AMDGPU/empty-function.ll index a060900811e..1231cb4d1de 100644 --- a/llvm/test/CodeGen/AMDGPU/empty-function.ll +++ b/llvm/test/CodeGen/AMDGPU/empty-function.ll @@ -7,14 +7,14 @@ ; SI-LABEL: {{^}}empty_function_ret: ; SI: s_endpgm ; SI: codeLenInByte = 4 -define void @empty_function_ret() #0 { +define amdgpu_kernel void @empty_function_ret() #0 { ret void } ; SI: .text ; SI-LABEL: {{^}}empty_function_unreachable: ; SI: codeLenInByte = 0 -define void @empty_function_unreachable() #0 { +define amdgpu_kernel void @empty_function_unreachable() #0 { unreachable } diff --git a/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll index 76b50b56301..6eb1fc1d0cc 100644 --- a/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll +++ b/llvm/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll @@ -9,7 +9,7 @@ ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] ; GCN-UNSAFE-NOT: xor -define void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll index c67095438ee..bd861e0c663 100644 --- a/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll +++ b/llvm/test/CodeGen/AMDGPU/endcf-loop-header.ll @@ -12,7 +12,7 @@ ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}} ; CHECK-NOT: s_or_b64 exec, exec ; CHECK: s_cbranch_execnz [[LOOP_LABEL]] -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: %cond = call i32 @llvm.r600.read.tidig.x() #0 %tmp0 = icmp eq i32 %cond, 0 diff --git a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll index 8ef54b9e95d..40d115bfc06 100644 --- a/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/exceed-max-sgprs.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti -define void @use_too_many_sgprs_tahiti() #0 { +define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 { call void asm sideeffect "", 
"~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -20,7 +20,7 @@ define void @use_too_many_sgprs_tahiti() #0 { } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire -define void @use_too_many_sgprs_bonaire() #1 { +define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -39,7 +39,7 @@ define void @use_too_many_sgprs_bonaire() #1 { } ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr -define void @use_too_many_sgprs_bonaire_flat_scr() #1 { +define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () @@ -59,7 +59,7 @@ define void @use_too_many_sgprs_bonaire_flat_scr() #1 { } ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland -define void @use_too_many_sgprs_iceland() #2 { +define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 { call void asm sideeffect "", "~{VCC}" () call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () @@ -77,7 +77,7 @@ define void @use_too_many_sgprs_iceland() #2 { } ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji -define void @use_too_many_sgprs_fiji() #3 { +define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () diff --git a/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll b/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll index cf384da2c5b..0fa06b87eba 100644 --- a/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: and_zext: ; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in @@ -18,7 +18,7 @@ define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; GCN-LABEL: or_zext: ; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* 
%in, i32 %id %a = load i16, i16 addrspace(1)* %in @@ -33,7 +33,7 @@ define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; GCN-LABEL: xor_zext: ; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]] -define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id %a = load i16, i16 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/extload-align.ll b/llvm/test/CodeGen/AMDGPU/extload-align.ll index 9d2eb74c7ba..4644800421d 100644 --- a/llvm/test/CodeGen/AMDGPU/extload-align.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-align.ll @@ -9,7 +9,7 @@ ; DEBUG: mem:LD2[<unknown>]{{[^(]}} ; DEBUG: {{^}}# End machine code for function extload_align. -define void @extload_align(i32* %out, i32 %index) #0 { +define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 { %v0 = alloca [4 x i16] %a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0 %a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll index 505aef3258c..fd298b361d0 100644 --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}load_i8_sext_private: ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i8_sext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 %tmp1 = load i8, i8* %tmp0 @@ -14,7 +14,7 @@ entry: ; FUNC-LABEL: {{^}}load_i8_zext_private: ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i8_zext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8 %tmp1 = load i8, i8* %tmp0 @@ -25,7 +25,7 @@ entry: ; FUNC-LABEL: {{^}}load_i16_sext_private: ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i16_sext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 %tmp1 = load i16, i16* %tmp0 @@ -36,7 +36,7 @@ entry: ; FUNC-LABEL: {{^}}load_i16_zext_private: ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} -define void @load_i16_zext_private(i32 addrspace(1)* %out) { +define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 %tmp1 = load volatile i16, i16* %tmp0 diff --git a/llvm/test/CodeGen/AMDGPU/extload.ll b/llvm/test/CodeGen/AMDGPU/extload.ll index 8b3e087d1f4..a7b8e86220a 100644 --- a/llvm/test/CodeGen/AMDGPU/extload.ll +++ b/llvm/test/CodeGen/AMDGPU/extload.ll @@ -10,7 +10,7 @@ ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], ; EG: VTX_READ_32 [[VAL]] -define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind { %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)* %load = load i32, i32 addrspace(1)* %cast %x = bitcast i32 %load to <4 
x i8> @@ -25,7 +25,7 @@ define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 a ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]], ; EG: VTX_READ_32 [[VAL]] -define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind { %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)* %load = load i32, i32 addrspace(1)* %cast %x = bitcast i32 %load to <2 x i16> @@ -40,7 +40,7 @@ define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i1 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] ; EG: LDS_WRITE * [[VAL]] -define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind { %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)* %load = load i32, i32 addrspace(3)* %cast %x = bitcast i32 %load to <4 x i8> @@ -55,7 +55,7 @@ define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 ad ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]] ; EG: LDS_WRITE * [[VAL]] -define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { +define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind { %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)* %load = load i32, i32 addrspace(3)* %cast %x = bitcast i32 %load to <2 x i16> diff --git a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll index 4edff152e66..be85ca933c3 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll @@ -13,7 +13,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i32 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { @@ -55,7 +55,7 @@ define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noal ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i32 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { @@ -99,7 +99,7 @@ define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace( ; GCN: buffer_store_dwordx2 ; GCN: buffer_store_dwordx2 -define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0, +define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0, <4 x i32> addrspace(1)* noalias %out1, i64 addrspace(1)* noalias %out2, i32 addrspace(1)* %in) { diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll 
b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll index 84c7955c138..1f567ae0508 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %p0 = extractelement <2 x half> %vec, i32 0 %p1 = extractelement <2 x half> %vec, i32 1 @@ -26,7 +26,7 @@ define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrsp ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %elt = extractelement <2 x half> %vec, i32 %idx store half %elt, half addrspace(1)* %out, align 2 @@ -45,7 +45,7 @@ define void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext @@ -61,7 +61,7 @@ define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 { %p0 = extractelement <3 x half> %foo, i32 0 %p1 = extractelement <3 x half> %foo, i32 2 %out1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -75,7 +75,7 @@ define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 { %p0 = extractelement <4 x half> %foo, i32 0 %p1 = extractelement <4 x half> %foo, i32 2 %out1 = getelementptr half, half addrspace(1)* %out, i32 10 @@ -95,7 +95,7 @@ define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) ; GCN: buffer_load_ushort ; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 { %p0 = extractelement <3 x half> %foo, i32 %idx %out1 = getelementptr half, half addrspace(1)* %out, i32 1 store half %p0, half 
addrspace(1)* %out @@ -115,7 +115,7 @@ define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half ; GCN: buffer_load_ushort ; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 { %p0 = extractelement <4 x half> %foo, i32 %idx %out1 = getelementptr half, half addrspace(1)* %out, i32 1 store half %p0, half addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll index 4594379dae0..db5bf0b4e80 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll @@ -5,7 +5,7 @@ ; GCN: buffer_load_dwordx4 ; GCN: buffer_load_dwordx2 ; GCN: buffer_store_dwordx2 -define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in %elt = extractelement <3 x double> %ld, i32 2 store volatile double %elt, double addrspace(1)* %out @@ -13,14 +13,14 @@ define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64: -define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64: -define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x double> %foo, i32 %elt store volatile double %dynelt, double addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index 22e04db7d3b..9b117d48a98 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -9,7 +9,7 @@ ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN-DAG: buffer_store_short [[VELT0]] ; GCN-DAG: buffer_store_short [[VELT1]] -define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %p0 = extractelement <2 x i16> %vec, i32 0 %p1 = extractelement <2 x i16> %vec, i32 1 @@ -27,7 +27,7 @@ define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspac ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]] ; GCN: buffer_store_short [[VELT1]] ; GCN: ScratchSize: 0 -define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt = extractelement <2 x i16> %vec, i32 %idx store i16 %elt, i16 
addrspace(1)* %out, align 2 @@ -45,7 +45,7 @@ define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x ; SI: buffer_store_short [[ELT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]] ; GCN: ScratchSize: 0{{$}} -define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext @@ -61,7 +61,7 @@ define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x ; GCN: buffer_load_ushort ; GCN: buffer_store_short ; GCN: buffer_store_short -define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 { %p0 = extractelement <3 x i16> %foo, i32 0 %p1 = extractelement <3 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 @@ -82,7 +82,7 @@ define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 ; GFX9-DAG: buffer_store_short [[VLOAD0]], off ; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]] ; GFX9-DAG: buffer_store_short [[VLOAD1]], off -define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 @@ -105,7 +105,7 @@ define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 ; GCN: buffer_load_ushort ; GCN: buffer_store_short -define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out @@ -131,7 +131,7 @@ define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> ; GFX9: buffer_store_dword ; GFX9: buffer_load_ushort ; GFX9: buffer_store_short -define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 %p0, i16 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll index 1df91c93329..a8d127879a3 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll @@ -8,7 +8,7 @@ ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dwordx2 -define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 { +define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 { %vec = bitcast i64 %val to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 0 %elt1 = extractelement <2 x i32> %vec, i32 1 @@ -20,7 
+20,7 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa } ; GCN-LABEL: {{^}}extract_vector_elt_v2i64: -define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 { %p0 = extractelement <2 x i64> %foo, i32 0 %p1 = extractelement <2 x i64> %foo, i32 1 %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1 @@ -30,14 +30,14 @@ define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64: -define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <2 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2: -define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 { %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo %or = or <2 x i64> %load, %arst %dynelt = extractelement <2 x i64> %or, i32 %elt @@ -46,14 +46,14 @@ define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> ad } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64: -define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <3 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64: -define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { +define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 { %dynelt = extractelement <4 x i64> %foo, i32 %elt store volatile i64 %dynelt, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll index 6f4ae827f43..b7d768fd552 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8: ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { %p0 = extractelement <1 x i8> %foo, i32 0 store i8 %p0, i8 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { %p0 = extractelement <2 x i8> %foo, i32 0 %p1 = extractelement <2 x i8> %foo, i32 1 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -29,7 +29,7 @@ define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define 
void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { %p0 = extractelement <3 x i8> %foo, i32 0 %p1 = extractelement <3 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -43,7 +43,7 @@ define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { %p0 = extractelement <4 x i8> %foo, i32 0 %p1 = extractelement <4 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -57,7 +57,7 @@ define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { %p0 = extractelement <8 x i8> %foo, i32 0 %p1 = extractelement <8 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -71,7 +71,7 @@ define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 { ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 { %p0 = extractelement <16 x i8> %foo, i32 0 %p1 = extractelement <16 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -85,7 +85,7 @@ define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 { %p0 = extractelement <32 x i8> %foo, i32 0 %p1 = extractelement <32 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -99,7 +99,7 @@ define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { +define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 { %p0 = extractelement <64 x i8> %foo, i32 0 %p1 = extractelement <64 x i8> %foo, i32 2 %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 @@ -120,7 +120,7 @@ define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 { %p0 = extractelement <3 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out @@ -141,7 +141,7 @@ define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %fo ; GCN: buffer_store_byte ; GCN: buffer_load_ubyte ; GCN: buffer_store_byte -define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 
%idx) #0 { +define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i8> %foo, i32 %idx %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 %p0, i8 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll index e160c20a03a..34999fa3aea 100644 --- a/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/extractelt-to-trunc.ll @@ -7,7 +7,7 @@ ; GCN-DAG: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]] ; GCN: buffer_store_dword [[ADD]] -define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { +define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x i32> @@ -20,7 +20,7 @@ define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspa ; GCN: buffer_load_dwordx2 ; GCN: v_add_f64 ; GCN: buffer_store_dword v -define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { +define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) { %a = load double, double addrspace(1)* %in %add = fadd double %a, %b %val.bc = bitcast double %add to <2 x i32> @@ -33,7 +33,7 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs ; GCN: buffer_load_dwordx2 ; GCN: v_add_i32 ; GCN: buffer_store_dword -define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { +define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) { %a = load i64, i64 addrspace(1)* %in %add = add i64 %a, %b %val.bc = bitcast i64 %add to <2 x float> @@ -45,7 +45,7 @@ define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 add ; GCN-LABEL: {{^}}no_extract_volatile_load_extract0: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %elt0 = extractelement <4 x i32> %vec, i32 0 @@ -57,7 +57,7 @@ entry: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %elt2 = extractelement <4 x i32> %vec, i32 2 @@ -68,7 +68,7 @@ entry: ; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract: ; GCN: buffer_load_dwordx4 ; GCN: buffer_store_dword v -define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %eltN = extractelement <4 x i32> %vec, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 
5ff9dcd49c5..b80fbdaf8ea 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -11,7 +11,7 @@ ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) store half %fabs, half addrspace(1)* %out @@ -22,7 +22,7 @@ define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { ; CI: flat_load_ushort [[VAL:v[0-9]+]], ; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @s_fabs_f16(half addrspace(1)* %out, half %in) { +define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, half addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @s_fabs_f16(half addrspace(1)* %out, half %in) { ; GFX9: s_load_dword [[VAL:s[0-9]+]] ; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff -define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { +define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, <2 x half> addrspace(1)* %out ret void @@ -68,7 +68,7 @@ define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; GCN: flat_store_dwordx2 -define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { +define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, <4 x half> addrspace(1)* %out ret void @@ -87,7 +87,7 @@ define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { ; VI-NOT: and ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { +define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { %fabs = call half @llvm.fabs.f16(half %in0) %fmul = fmul half %fabs, %in1 store half %fmul, half addrspace(1)* %out @@ -97,7 +97,7 @@ define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { ; GCN-LABEL: {{^}}v_fabs_v2f16: ; GCN: flat_load_dword [[VAL:v[0-9]+]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]] -define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid @@ -110,7 +110,7 @@ define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1) ; GCN-LABEL: {{^}}fabs_free_v2f16: ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff -define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { %bc = bitcast i32 %in to <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc) store <2 x half> %fabs, <2 x half> 
addrspace(1)* %out @@ -133,7 +133,7 @@ define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} -define void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %fmul = fmul <2 x half> %fabs, %val diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index f7780b875ff..998e02f7bdf 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -10,7 +10,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone ; FUNC-LABEL: {{^}}v_fabs_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tidext = sext i32 %tid to i64 %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext @@ -24,7 +24,7 @@ define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) { ; SI: v_and_b32 ; SI-NOT: v_and_b32 ; SI: s_endpgm -define void @fabs_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) { %fabs = call double @llvm.fabs.f64(double %in) store double %fabs, double addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @fabs_f64(double addrspace(1)* %out, double %in) { ; SI: v_and_b32 ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, <2 x double> addrspace(1)* %out ret void @@ -46,7 +46,7 @@ define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { ; SI: v_and_b32 ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) store <4 x double> %fabs, <4 x double> addrspace(1)* %out ret void @@ -57,7 +57,7 @@ define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| ; SI: s_endpgm -define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out @@ -69,7 +69,7 @@ define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) ; SI-NOT: and ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]| ; SI: s_endpgm -define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { +define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) { %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, double addrspace(1)* %out 
@@ -79,7 +79,7 @@ define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in ; FUNC-LABEL: {{^}}fabs_free_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) store double %fabs, double addrspace(1)* %out @@ -89,7 +89,7 @@ define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) { ; FUNC-LABEL: {{^}}fabs_fn_free_f64: ; SI: v_and_b32 ; SI: s_endpgm -define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) { %bc= bitcast i64 %in to double %fabs = call double @fabs(double %bc) store double %fabs, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll index 98e7f9e3e9a..ac8fa3e45ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.ll @@ -13,7 +13,7 @@ ; GCN: v_and_b32 -define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @fabs(float %bc) store float %fabs, float addrspace(1)* %out @@ -26,7 +26,7 @@ define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) { ; GCN: v_and_b32 -define void @fabs_free(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) { %bc= bitcast i32 %in to float %fabs = call float @llvm.fabs.f32(float %bc) store float %fabs, float addrspace(1)* %out @@ -37,7 +37,7 @@ define void @fabs_free(float addrspace(1)* %out, i32 %in) { ; R600: |{{(PV|T[0-9])\.[XYZW]}}| ; GCN: v_and_b32 -define void @fabs_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) { %fabs = call float @llvm.fabs.f32(float %in) store float %fabs, float addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define void @fabs_f32(float addrspace(1)* %out, float %in) { ; GCN: v_and_b32 ; GCN: v_and_b32 -define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in) store <2 x float> %fabs, <2 x float> addrspace(1)* %out ret void @@ -65,7 +65,7 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ; GCN: v_and_b32 ; GCN: v_and_b32 ; GCN: v_and_b32 -define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in) store <4 x float> %fabs, <4 x float> addrspace(1)* %out ret void @@ -76,7 +76,7 @@ define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| -define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { +define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @fabs(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, float addrspace(1)* %out @@ -88,7 +88,7 @@ define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) { ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], 
s[{{[0-9]+:[0-9]+}}], 0x2c ; GCN-NOT: and ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]| -define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { +define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) { %fabs = call float @llvm.fabs.f32(float %in0) %fmul = fmul float %fabs, %in1 store float %fmul, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll index b74bce76f79..9edf55cbc69 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -28,7 +28,7 @@ ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -55,7 +55,7 @@ define void @fast_add_fmuladd_fmul() #0 { ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]] ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]] ; GCN-FASTFMA: buffer_store_dword [[FMA1]] -define void @fast_sub_fmuladd_fmul() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -87,7 +87,7 @@ define void @fast_sub_fmuladd_fmul() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_mul() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -120,7 +120,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -145,7 +145,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -170,7 +170,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 { ; GCN-SLOWFMA: v_mul_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_add_f32_e32 -define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { +define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -205,7 +205,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 { ; GCN: buffer_store_dword [[MUL]] ; GCN: buffer_store_dword [[MAD]] -define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { 
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -241,7 +241,7 @@ define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 { ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]] ; GCN-SLOWFMA: v_add_f32_e32 ; GCN-SLOWFMA: v_subrev_f32_e32 -define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { +define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index f7aa8446592..0861d327987 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -11,7 +11,7 @@ ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16( +define amdgpu_kernel void @fadd_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -31,7 +31,7 @@ entry: ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16_imm_a( +define amdgpu_kernel void @fadd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -49,7 +49,7 @@ entry: ; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fadd_f16_imm_b( +define amdgpu_kernel void @fadd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -83,7 +83,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16( +define amdgpu_kernel void @fadd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -111,7 +111,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16_imm_a( +define amdgpu_kernel void @fadd_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -136,7 +136,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fadd_v2f16_imm_b( +define amdgpu_kernel void @fadd_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fadd.ll b/llvm/test/CodeGen/AMDGPU/fadd.ll index 989eb03268e..621a0de281d 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}fadd_f32: ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W ; SI: v_add_f32 -define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 { %add = fadd float %a, %b store float %add, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 { ; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, 
<2 x float> %b) #0 { %add = fadd <2 x float> %a, %b store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8 ret void @@ -31,7 +31,7 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo ; SI: v_add_f32 ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 @@ -57,7 +57,7 @@ define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; SI: v_add_f32 ; SI: v_add_f32 ; SI: v_add_f32 -define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { +define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 { %add = fadd <8 x float> %a, %b store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32 ret void @@ -65,7 +65,7 @@ define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x flo ; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32: ; SI-NOT: v_add_f32 -define void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 { +define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 { %add = fadd float %a, 0.0 store float %add, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fadd64.ll b/llvm/test/CodeGen/AMDGPU/fadd64.ll index 6f0c9de8eba..7eb7747de21 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd64.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd64.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}v_fadd_f64: ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}} -define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -14,7 +14,7 @@ define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}s_fadd_f64: ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { +define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { %r2 = fadd double %r0, %r1 store double %r2, double addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) { ; CHECK: v_add_f64 ; CHECK: v_add_f64 ; CHECK: _store_dwordx4 -define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -37,7 +37,7 @@ define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} ; CHECK: _store_dwordx4 -define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { +define 
amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) { %r2 = fadd <2 x double> %r0, %r1 store <2 x double> %r2, <2 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index e1edc26d7ed..a82b310726a 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -9,7 +9,7 @@ declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16: ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, half addrspace(1)* %out @@ -19,7 +19,7 @@ define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 { %val = bitcast i16 %val.arg to half %canonicalized = call half @llvm.canonicalize.f16(half %val) store half %canonicalized, half addrspace(1)* %out @@ -29,7 +29,7 @@ define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %v ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) @@ -40,7 +40,7 @@ define void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fabs = call half @llvm.fabs.f16(half %val) %val.fabs.fneg = fsub half -0.0, %val.fabs @@ -52,7 +52,7 @@ define void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16: ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} ; GCN: buffer_store_short [[REG]] -define void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { %val = load half, half addrspace(1)* %out %val.fneg = fsub half -0.0, %val %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) @@ -63,7 +63,7 @@ define void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void 
@test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -72,7 +72,7 @@ define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -0.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -81,7 +81,7 @@ define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 1.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -90,7 +90,7 @@ define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half -1.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -99,7 +99,7 @@ define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 16.0) store half %canonicalized, half addrspace(1)* %out ret void @@ -108,7 +108,7 @@ define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -117,7 +117,7 @@ define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspa ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -126,7 +126,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* % ; GCN-LABEL: 
{{^}}test_default_denormals_fold_canonicalize_denormal1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -135,7 +135,7 @@ define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspa ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF) store half %canonicalized, half addrspace(1)* %out ret void @@ -144,7 +144,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00) store half %canonicalized, half addrspace(1)* %out ret void @@ -153,7 +153,7 @@ define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half)) store half %canonicalized, half addrspace(1)* %out ret void @@ -162,7 +162,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half)) store half %canonicalized, half addrspace(1)* %out ret void @@ -171,7 +171,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01) store half %canonicalized, half addrspace(1)* %out ret void @@ -180,7 +180,7 @@ define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void 
@test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF) store half %canonicalized, half addrspace(1)* %out ret void @@ -189,7 +189,7 @@ define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF) store half %canonicalized, half addrspace(1)* %out ret void @@ -198,7 +198,7 @@ define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}} ; GCN: buffer_store_short [[REG]] -define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 { %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01) store half %canonicalized, half addrspace(1)* %out ret void @@ -211,7 +211,7 @@ define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] -define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -229,7 +229,7 @@ define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} ; GCN: buffer_store_dword -define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs) @@ -246,7 +246,7 @@ define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) # ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GCN: buffer_store_dword -define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs @@ -265,7 +265,7 @@ define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %o ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GFX9: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void 
@v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val) @@ -280,7 +280,7 @@ define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) # ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}} ; GFX9: buffer_store_dword [[REG]] -define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out @@ -290,7 +290,7 @@ define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 ze ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -299,7 +299,7 @@ define void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -308,7 +308,7 @@ define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -317,7 +317,7 @@ define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -326,7 +326,7 @@ define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void 
@test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -335,7 +335,7 @@ define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -344,7 +344,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addr ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -353,7 +353,7 @@ define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspa ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -362,7 +362,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addr ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -371,7 +371,7 @@ define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspa ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -380,7 +380,7 @@ define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 ; GCN-LABEL: 
{{^}}test_fold_canonicalize_qnan_value_neg1_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>)) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -389,7 +389,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -398,7 +398,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -407,7 +407,7 @@ define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -416,7 +416,7 @@ define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void @@ -425,7 +425,7 @@ define void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* % ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 { %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, 
half 0xHFC01>) store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll index 075a6482713..8c385f40b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -8,7 +8,7 @@ declare double @llvm.canonicalize.f64(double) #0 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32: ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out @@ -18,7 +18,7 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 { %canonicalized = call float @llvm.canonicalize.f32(float %val) store float %canonicalized, float addrspace(1)* %out ret void @@ -27,7 +27,7 @@ define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) # ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}| ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs) @@ -38,7 +38,7 @@ define void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}| ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fabs = call float @llvm.fabs.f32(float %val) %val.fabs.fneg = fsub float -0.0, %val.fabs @@ -50,7 +50,7 @@ define void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32: ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}} ; GCN: buffer_store_dword [[REG]] -define void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { %val = load float, float addrspace(1)* %out %val.fneg = fsub float -0.0, %val %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg) @@ -61,7 +61,7 @@ define void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0.0) store float %canonicalized, float 
addrspace(1)* %out ret void @@ -70,7 +70,7 @@ define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -0.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -79,7 +79,7 @@ define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 1.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -88,7 +88,7 @@ define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float -1.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -97,7 +97,7 @@ define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 16.0) store float %canonicalized, float addrspace(1)* %out ret void @@ -106,7 +106,7 @@ define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -115,7 +115,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -124,7 +124,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define 
void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -133,7 +133,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -142,7 +142,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000) store float %canonicalized, float addrspace(1)* %out ret void @@ -151,7 +151,7 @@ define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -160,7 +160,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -169,7 +169,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -178,7 +178,7 @@ define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void 
@test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -187,7 +187,7 @@ define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -196,7 +196,7 @@ define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 { %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float)) store float %canonicalized, float addrspace(1)* %out ret void @@ -205,7 +205,7 @@ define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out @@ -215,7 +215,7 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { +define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 { %canonicalized = call double @llvm.canonicalize.f64(double %val) store double %canonicalized, double addrspace(1)* %out ret void @@ -224,7 +224,7 @@ define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}| ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs) @@ -235,7 +235,7 @@ define void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}| ; GCN: buffer_store_dwordx2 [[REG]] -define void 
@v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fabs = call double @llvm.fabs.f64(double %val) %val.fabs.fneg = fsub double -0.0, %val.fabs @@ -247,7 +247,7 @@ define void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64: ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { %val = load double, double addrspace(1)* %out %val.fneg = fsub double -0.0, %val %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg) @@ -259,7 +259,7 @@ define void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 { ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -269,7 +269,7 @@ define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -0.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -279,7 +279,7 @@ define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 1.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -289,7 +289,7 @@ define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double -1.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -299,7 +299,7 @@ define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { %canonicalized = call double 
@llvm.canonicalize.f64(double 16.0) store double %canonicalized, double addrspace(1)* %out ret void @@ -309,7 +309,7 @@ define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 { ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -319,7 +319,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace( ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -329,7 +329,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { +define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -339,7 +339,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace( ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { +define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -349,7 +349,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000) store double %canonicalized, double addrspace(1)* %out ret void @@ -359,7 +359,7 @@ define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void 
@test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -369,7 +369,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %ou ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -379,7 +379,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %ou ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -389,7 +389,7 @@ define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) # ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -399,7 +399,7 @@ define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) # ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double)) store double %canonicalized, double addrspace(1)* %out ret void @@ -409,7 +409,7 @@ define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) # ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 { %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double)) store double %canonicalized, double addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fceil.ll b/llvm/test/CodeGen/AMDGPU/fceil.ll index efdda78f852..0b913fda858 100644 --- a/llvm/test/CodeGen/AMDGPU/fceil.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil.ll @@ -13,7 +13,7 @@ declare <16 x float> @llvm.ceil.v16f32(<16 x 
float>) nounwind readnone ; SI: v_ceil_f32_e32 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) { %y = call float @llvm.ceil.f32(float %x) nounwind readnone store float %y, float addrspace(1)* %out ret void @@ -25,7 +25,7 @@ define void @fceil_f32(float addrspace(1)* %out, float %x) { ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}} ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone store <2 x float> %y, <2 x float> addrspace(1)* %out ret void @@ -41,7 +41,7 @@ define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { ; EG-DAG: CEIL {{\*? *}}[[RESULT1]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone store <3 x float> %y, <3 x float> addrspace(1)* %out ret void @@ -57,7 +57,7 @@ define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] ; EG: CEIL {{\*? *}}[[RESULT]] -define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone store <4 x float> %y, <4 x float> addrspace(1)* %out ret void @@ -82,7 +82,7 @@ define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] ; EG-DAG: CEIL {{\*? *}}[[RESULT2]] -define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { +define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone store <8 x float> %y, <8 x float> addrspace(1)* %out ret void @@ -125,7 +125,7 @@ define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { ; EG-DAG: CEIL {{\*? *}}[[RESULT4]] ; EG-DAG: CEIL {{\*? *}}[[RESULT4]] ; EG-DAG: CEIL {{\*? 
*}}[[RESULT4]] -define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { +define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone store <16 x float> %y, <16 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll index 98448db5dd2..61572a85562 100644 --- a/llvm/test/CodeGen/AMDGPU/fceil64.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -31,7 +31,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; SI: v_cndmask_b32 ; SI: v_add_f64 ; SI: s_endpgm -define void @fceil_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.ceil.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -40,7 +40,7 @@ define void @fceil_f64(double addrspace(1)* %out, double %x) { ; FUNC-LABEL: {{^}}fceil_v2f64: ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; FIXME-CI: v_ceil_f64_e32 ; FIXME-CI: v_ceil_f64_e32 ; FIXME-CI: v_ceil_f64_e32 -; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone ; store <3 x double> %y, <3 x double> addrspace(1)* %out ; ret void @@ -61,7 +61,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -76,7 +76,7 @@ define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -99,7 +99,7 @@ define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 -define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll index 530274f920f..7f8be804309 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnd.ll @@ -4,7 +4,7 @@ ;registers and 
literal.x depending on what the optimizer does. ;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %cmp = fcmp oeq float %0, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll index c402805feb3..2a848e80b81 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll @@ -6,7 +6,7 @@ ; CHECK: SET{{[A-Z]+}}_DX10 -define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %cmp = fcmp oeq float %0, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll index 8a01ea2fe43..7916226462f 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.f16.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lt( +define amdgpu_kernel void @fcmp_f16_lt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -37,7 +37,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lt_abs( +define amdgpu_kernel void @fcmp_f16_lt_abs( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -62,7 +62,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_eq( +define amdgpu_kernel void @fcmp_f16_eq( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -85,7 +85,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_le( +define amdgpu_kernel void @fcmp_f16_le( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -108,7 +108,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_gt( +define amdgpu_kernel void @fcmp_f16_gt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -131,7 +131,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_lg( +define amdgpu_kernel void @fcmp_f16_lg( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -154,7 +154,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_ge( +define amdgpu_kernel void @fcmp_f16_ge( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -177,7 +177,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_o( +define amdgpu_kernel void @fcmp_f16_o( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -200,7 +200,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_u( +define amdgpu_kernel void @fcmp_f16_u( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -223,7 +223,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm 
-define void @fcmp_f16_nge( +define amdgpu_kernel void @fcmp_f16_nge( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -246,7 +246,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nlg( +define amdgpu_kernel void @fcmp_f16_nlg( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -269,7 +269,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_ngt( +define amdgpu_kernel void @fcmp_f16_ngt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -292,7 +292,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nle( +define amdgpu_kernel void @fcmp_f16_nle( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -315,7 +315,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_neq( +define amdgpu_kernel void @fcmp_f16_neq( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -338,7 +338,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fcmp_f16_nlt( +define amdgpu_kernel void @fcmp_f16_nlt( i32 addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -368,7 +368,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_lt( +define amdgpu_kernel void @fcmp_v2f16_lt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -398,7 +398,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_eq( +define amdgpu_kernel void @fcmp_v2f16_eq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -428,7 +428,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_le( +define amdgpu_kernel void @fcmp_v2f16_le( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -458,7 +458,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_gt( +define amdgpu_kernel void @fcmp_v2f16_gt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -488,7 +488,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_lg( +define amdgpu_kernel void @fcmp_v2f16_lg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -518,7 +518,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_ge( +define amdgpu_kernel void @fcmp_v2f16_ge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -548,7 +548,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_o( +define amdgpu_kernel void 
@fcmp_v2f16_o( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -578,7 +578,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_u( +define amdgpu_kernel void @fcmp_v2f16_u( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -608,7 +608,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nge( +define amdgpu_kernel void @fcmp_v2f16_nge( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -638,7 +638,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nlg( +define amdgpu_kernel void @fcmp_v2f16_nlg( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -668,7 +668,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_ngt( +define amdgpu_kernel void @fcmp_v2f16_ngt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -698,7 +698,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nle( +define amdgpu_kernel void @fcmp_v2f16_nle( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -728,7 +728,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_neq( +define amdgpu_kernel void @fcmp_v2f16_neq( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -758,7 +758,7 @@ entry: ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}} ; GCN: s_endpgm -define void @fcmp_v2f16_nlt( +define amdgpu_kernel void @fcmp_v2f16_nlt( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { diff --git a/llvm/test/CodeGen/AMDGPU/fcmp.ll b/llvm/test/CodeGen/AMDGPU/fcmp.ll index 97d954fcc3c..b548670edb0 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp.ll @@ -3,7 +3,7 @@ ; CHECK: {{^}}fcmp_sext: ; CHECK: SETE_DX10 T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) { entry: %0 = load float, float addrspace(1)* %in %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1 @@ -22,7 +22,7 @@ entry: ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}} ; CHECK-NEXT: {{[0-9]+\(5.0}} -define void @fcmp_br(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 br i1 %0, label %IF, label %ENDIF diff --git a/llvm/test/CodeGen/AMDGPU/fcmp64.ll b/llvm/test/CodeGen/AMDGPU/fcmp64.ll index acce82fdfe5..b9e1921d4c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fcmp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcmp64.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}flt_f64: ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @flt_f64(i32 
addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -15,7 +15,7 @@ define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fle_f64: ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -27,7 +27,7 @@ define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fgt_f64: ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -39,7 +39,7 @@ define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fge_f64: ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -51,7 +51,7 @@ define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}fne_f64: ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -63,7 +63,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CHECK-LABEL: {{^}}feq_f64: ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}} -define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 diff --git a/llvm/test/CodeGen/AMDGPU/fconst64.ll b/llvm/test/CodeGen/AMDGPU/fconst64.ll index 89af37545c9..12559779624 100644 --- a/llvm/test/CodeGen/AMDGPU/fconst64.ll +++ b/llvm/test/CodeGen/AMDGPU/fconst64.ll @@ -5,7 +5,7 @@ ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0 -define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %r1 = load double, double addrspace(1)* %in %r2 = fadd double %r1, 5.000000e+00 store double %r2, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 1e7490a4ff7..79f8392a9b5 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -23,7 +23,7 @@ declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) ; VI: 
v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_f16( +define amdgpu_kernel void @test_copysign_f16( half addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -43,7 +43,7 @@ entry: ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]] ; GCN: buffer_store_dword v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f32_mag_f16_sign_f32( +define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32( float addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { @@ -65,7 +65,7 @@ entry: ; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_out_f64_mag_f16_sign_f64( +define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64( double addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { @@ -88,7 +88,7 @@ entry: ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] ; GCN: buffer_store_dword v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f32_mag_f32_sign_f16( +define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16( float addrspace(1)* %arg_out, float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -111,7 +111,7 @@ entry: ; VI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_out_f64_mag_f64_sign_f16( +define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16( double addrspace(1)* %arg_out, double addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -136,7 +136,7 @@ entry: ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f16_sign_f32( +define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32( half addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, float addrspace(1)* %arg_sign) { @@ -161,7 +161,7 @@ entry: ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f16_sign_f64( +define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64( half addrspace(1)* %arg_out, half addrspace(1)* %arg_mag, double addrspace(1)* %arg_sign) { @@ -188,7 +188,7 @@ entry: ; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]] ; GCN: buffer_store_short v[[OUT]] ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f32_sign_f16( +define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16( half addrspace(1)* %arg_out, float addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -204,7 +204,7 @@ entry: ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16: ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_out_f16_mag_f64_sign_f16( +define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16( half addrspace(1)* %arg_out, double addrspace(1)* %arg_mag, half addrspace(1)* %arg_sign) { @@ -221,7 +221,7 @@ entry: ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_v2f16( +define amdgpu_kernel void @test_copysign_v2f16( <2 x half> addrspace(1)* %arg_out, <2 x half> %arg_mag, <2 x half> %arg_sign) { @@ -236,7 +236,7 @@ entry: ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void 
@test_copysign_v3f16( +define amdgpu_kernel void @test_copysign_v3f16( <3 x half> addrspace(1)* %arg_out, <3 x half> %arg_mag, <3 x half> %arg_sign) { @@ -252,7 +252,7 @@ entry: ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm -define void @test_copysign_v4f16( +define amdgpu_kernel void @test_copysign_v4f16( <4 x half> addrspace(1)* %arg_out, <4 x half> %arg_mag, <4 x half> %arg_sign) { diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll index 632de18dafc..e5893e5995a 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll @@ -20,7 +20,7 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read ; GCN: s_endpgm ; EG: BFI_INT -define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind { %result = call float @llvm.copysign.f32(float %mag, float %sign) store float %result, float addrspace(1)* %out, align 4 ret void @@ -31,7 +31,7 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign ; EG: BFI_INT ; EG: BFI_INT -define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind { %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign) store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8 ret void @@ -44,7 +44,7 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma ; EG: BFI_INT ; EG: BFI_INT ; EG: BFI_INT -define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind { %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign) store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll index 12c942beee6..67779a8ff3b 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll @@ -17,7 +17,7 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} ; GCN: s_endpgm -define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind { %result = call double @llvm.copysign.f64(double %mag, double %sign) store double %result, double addrspace(1)* %out, align 8 ret void @@ -32,7 +32,7 @@ define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %s ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]] ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}} -define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { +define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind { %c = fpext float %sign to double %result = call double 
@llvm.copysign.f64(double %mag, double %c) store double %result, double addrspace(1)* %out, align 8 @@ -41,7 +41,7 @@ define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float ; FUNC-LABEL: {{^}}test_copysign_v2f64: ; GCN: s_endpgm -define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind { %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign) store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8 ret void @@ -49,7 +49,7 @@ define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> % ; FUNC-LABEL: {{^}}test_copysign_v4f64: ; GCN: s_endpgm -define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { +define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind { %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign) store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll index 4dc99042f80..7f84e973c95 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -31,7 +31,7 @@ ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16( +define amdgpu_kernel void @v_fdiv_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { @@ -54,7 +54,7 @@ entry: ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [[RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -72,7 +72,7 @@ entry: ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -91,7 +91,7 @@ entry: ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [[RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -109,7 +109,7 @@ entry: ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -127,7 +127,7 @@ entry: ; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; VI-NOT: [RESULT]] ; VI: flat_store_short 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -147,7 +147,7 @@ entry: ; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] ; VI-NOT: [RESULT]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -168,7 +168,7 @@ entry: ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { +define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -190,7 +190,7 @@ entry: ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { +define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -209,7 +209,7 @@ entry: ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} ; VI: buffer_store_short [[MUL]] -define void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, 2.0 store half %rcp, half addrspace(1)* %out, align 4 @@ -221,7 +221,7 @@ define void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 { ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}} ; VI: buffer_store_short [[MUL]] -define void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, 10.0 store half %rcp, half addrspace(1)* %out, align 4 @@ -233,7 +233,7 @@ define void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 { ; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}} ; VI: buffer_store_short [[MUL]] -define void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 { %x = load half, half addrspace(1)* undef %rcp = fdiv arcp half %x, -10.0 store half %rcp, half addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll index 04e1d5ac601..d16bdf43ee2 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -29,7 +29,7 @@ ; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]] ; GCN: buffer_store_dwordx2 [[RESULT]] ; GCN: s_endpgm -define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1 %num = load volatile double, double addrspace(1)* %in %den = load volatile 
double, double addrspace(1)* %gep.1 @@ -39,7 +39,7 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { } ; GCN-LABEL: {{^}}fdiv_f64_s_v: -define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 { +define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 { %den = load double, double addrspace(1)* %in %result = fdiv double %num, %den store double %result, double addrspace(1)* %out @@ -47,7 +47,7 @@ define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, d } ; GCN-LABEL: {{^}}fdiv_f64_v_s: -define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 { +define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 { %num = load double, double addrspace(1)* %in %result = fdiv double %num, %den store double %result, double addrspace(1)* %out @@ -55,14 +55,14 @@ define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, d } ; GCN-LABEL: {{^}}fdiv_f64_s_s: -define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 { +define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 { %result = fdiv double %num, %den store double %result, double addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_fdiv_v2f64: -define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1 %num = load <2 x double>, <2 x double> addrspace(1)* %in %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1 @@ -72,14 +72,14 @@ define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac } ; GCN-LABEL: {{^}}s_fdiv_v2f64: -define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { +define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) { %result = fdiv <2 x double> %num, %den store <2 x double> %result, <2 x double> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_fdiv_v4f64: -define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 %num = load <4 x double>, <4 x double> addrspace(1)* %in %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1 @@ -89,7 +89,7 @@ define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspac } ; GCN-LABEL: {{^}}s_fdiv_v4f64: -define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 { +define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 { %result = fdiv <4 x double> %num, %den store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -98,7 +98,7 @@ define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64: ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5 ; GCN: buffer_store_dwordx2 [[MUL]] -define void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* 
%out) #1 { %x = load double, double addrspace(1)* undef %rcp = fdiv fast double %x, 2.0 store double %rcp, double addrspace(1)* %out, align 4 @@ -110,7 +110,7 @@ define void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] -define void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef %rcp = fdiv fast double %x, 10.0 store double %rcp, double addrspace(1)* %out, align 4 @@ -122,7 +122,7 @@ define void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { ; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999 ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] -define void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef %rcp = fdiv fast double %x, -10.0 store double %rcp, double addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll index 0e95de9c555..b3a2b664372 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -27,7 +27,7 @@ ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], -define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv float %a, %b store float %fdiv, float addrspace(1)* %out @@ -52,7 +52,7 @@ entry: ; GCN-NOT: s_setreg ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]] ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]], -define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv float %a, %b store float %fdiv, float addrspace(1)* %out @@ -65,7 +65,7 @@ entry: ; GCN: v_rcp_f32 ; GCN: v_mul_f32 ; GCN: v_mul_f32 -define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv float %a, %b, !fpmath !0 store float %fdiv, float addrspace(1)* %out @@ -77,7 +77,7 @@ entry: ; GCN: v_fma_f32 ; GCN: v_div_fmas_f32 ; GCN: v_div_fixup_f32 -define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv float %a, %b, !fpmath !0 store float %fdiv, float addrspace(1)* %out @@ -89,7 +89,7 @@ entry: ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { +define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 { entry: %fdiv = fdiv fast float %a, %b store float %fdiv, float addrspace(1)* %out @@ -104,7 +104,7 @@ entry: ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword 
[[RESULT]] -define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv fast float %a, %b store float %fdiv, float addrspace(1)* %out @@ -119,7 +119,7 @@ entry: ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 { entry: %fdiv = fdiv arcp float %a, %b store float %fdiv, float addrspace(1)* %out @@ -136,7 +136,7 @@ entry: ; GCN: v_div_scale_f32 ; GCN: v_div_scale_f32 ; GCN: v_div_scale_f32 -define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -146,7 +146,7 @@ entry: ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32: ; GCN: v_cmp_gt_f32 ; GCN: v_cmp_gt_f32 -define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0 store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -161,7 +161,7 @@ entry: ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv fast <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -176,7 +176,7 @@ entry: ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { +define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 { entry: %fdiv = fdiv arcp <2 x float> %a, %b store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -197,7 +197,7 @@ entry: ; GCN: v_div_fixup_f32 ; GCN: v_div_fixup_f32 ; GCN: v_div_fixup_f32 -define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -220,7 +220,7 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -243,7 +243,7 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 ; GCN: v_rcp_f32 -define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll index 83ffbdfa23a..407cccb8443 100644 --- a/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/ffloor.f64.ll @@ -19,7 +19,7 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 ; SI: s_endpgm -define void @ffloor_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.floor.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) { ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]] ; SI: s_endpgm -define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { %neg = fsub double 0.0, %x %y = call double @llvm.floor.f64(double %neg) nounwind readnone store double %y, double addrspace(1)* %out @@ -50,7 +50,7 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) { ; SI: v_cndmask_b32_e32 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]| ; SI: s_endpgm -define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { %abs = call double @llvm.fabs.f64(double %x) %neg = fsub double 0.0, %abs %y = call double @llvm.floor.f64(double %neg) nounwind readnone @@ -61,7 +61,7 @@ define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) { ; FUNC-LABEL: {{^}}ffloor_v2f64: ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -72,7 +72,7 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI-NOT: v_floor_f64_e32 -define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone store <3 x double> %y, <3 x double> addrspace(1)* %out ret void @@ -83,7 +83,7 @@ define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, <4 x double> addrspace(1)* %out ret void @@ -98,7 +98,7 @@ define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { 
+define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -121,7 +121,7 @@ define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 ; CI: v_floor_f64_e32 -define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/ffloor.ll b/llvm/test/CodeGen/AMDGPU/ffloor.ll index d7f35a45075..720fe7a45e3 100644 --- a/llvm/test/CodeGen/AMDGPU/ffloor.ll +++ b/llvm/test/CodeGen/AMDGPU/ffloor.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}floor_f32: ; SI: v_floor_f32_e32 ; R600: FLOOR -define void @floor_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) { %tmp = call float @llvm.floor.f32(float %in) #0 store float %tmp, float addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @floor_f32(float addrspace(1)* %out, float %in) { ; SI: v_floor_f32_e32 ; SI: v_floor_f32_e32 -define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0 store <2 x float> %tmp, <2 x float> addrspace(1)* %out ret void @@ -31,7 +31,7 @@ define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { ; R600: FLOOR ; R600: FLOOR ; R600: FLOOR -define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0 store <4 x float> %tmp, <4 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll index 81a3ebca3a0..c867e4fca22 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll @@ -17,7 +17,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]] ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]] ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]] -define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* store volatile i32 %x, i32 addrspace(4)* %fptr, align 4 ret void @@ -25,7 +25,7 @@ define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 { ; CHECK-LABEL: {{^}}store_flat_i64: ; CHECK: flat_store_dwordx2 -define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { +define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* store volatile i64 %x, i64 addrspace(4)* %fptr, align 8 ret void @@ -33,7 +33,7 @@ define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 { ; CHECK-LABEL: {{^}}store_flat_v4i32: ; CHECK: flat_store_dwordx4 -define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_flat_v4i32(<4 x 
i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16 ret void @@ -41,7 +41,7 @@ define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 { ; CHECK-LABEL: {{^}}store_flat_trunc_i16: ; CHECK: flat_store_short -define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %y = trunc i32 %x to i16 store volatile i16 %y, i16 addrspace(4)* %fptr, align 2 @@ -50,7 +50,7 @@ define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 { ; CHECK-LABEL: {{^}}store_flat_trunc_i8: ; CHECK: flat_store_byte -define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { +define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %y = trunc i32 %x to i8 store volatile i8 %y, i8 addrspace(4)* %fptr, align 2 @@ -61,7 +61,7 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 { ; CHECK-LABEL: load_flat_i32: ; CHECK: flat_load_dword -define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)* %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4 store i32 %fload, i32 addrspace(1)* %out, align 4 @@ -70,7 +70,7 @@ define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noa ; CHECK-LABEL: load_flat_i64: ; CHECK: flat_load_dwordx2 -define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)* %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8 store i64 %fload, i64 addrspace(1)* %out, align 8 @@ -79,7 +79,7 @@ define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa ; CHECK-LABEL: load_flat_v4i32: ; CHECK: flat_load_dwordx4 -define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)* %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32 store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8 @@ -88,7 +88,7 @@ define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> add ; CHECK-LABEL: sextload_flat_i8: ; CHECK: flat_load_sbyte -define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 %ext = sext i8 %fload to i32 @@ -98,7 +98,7 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ; CHECK-LABEL: zextload_flat_i8: ; CHECK: flat_load_ubyte -define void @zextload_flat_i8(i32 addrspace(1)* noalias 
%out, i8 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)* %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4 %ext = zext i8 %fload to i32 @@ -108,7 +108,7 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n ; CHECK-LABEL: sextload_flat_i16: ; CHECK: flat_load_sshort -define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 %ext = sext i16 %fload to i32 @@ -118,7 +118,7 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ; CHECK-LABEL: zextload_flat_i16: ; CHECK: flat_load_ushort -define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { +define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 { %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)* %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4 %ext = zext i16 %fload to i32 @@ -131,7 +131,7 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte ; CHECK: flat_load_ubyte -define void @flat_scratch_unaligned_load() { +define amdgpu_kernel void @flat_scratch_unaligned_load() { %scratch = alloca i32 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1 @@ -143,7 +143,7 @@ define void @flat_scratch_unaligned_load() { ; CHECK: flat_store_byte ; CHECK: flat_store_byte ; CHECK: flat_store_byte -define void @flat_scratch_unaligned_store() { +define amdgpu_kernel void @flat_scratch_unaligned_store() { %scratch = alloca i32 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)* store volatile i32 0, i32 addrspace(4)* %fptr, align 1 @@ -154,7 +154,7 @@ define void @flat_scratch_unaligned_store() { ; HSA: flat_load_dword ; HSA: flat_load_dword ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr -define void @flat_scratch_multidword_load() { +define amdgpu_kernel void @flat_scratch_multidword_load() { %scratch = alloca <2 x i32> %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr @@ -165,7 +165,7 @@ define void @flat_scratch_multidword_load() { ; HSA: flat_store_dword ; HSA: flat_store_dword ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr -define void @flat_scratch_multidword_store() { +define amdgpu_kernel void @flat_scratch_multidword_store() { %scratch = alloca <2 x i32> %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)* store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll index df9ba00c697..dac1500cd46 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll @@ -23,7 +23,7 @@ ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64:
flat_store_dword -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void @@ -36,7 +36,7 @@ entry: ; NOHSA-DEFAULT: buffer_store_dword ; NOHSA-NODEFAULT: flat_store_dword ; NOHSA-NOADDR64: flat_store_dword -define void @test_addr64(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) { entry: %out.addr = alloca i32 addrspace(1)*, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll index b71c8bcb76c..23f40daf3d2 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll @@ -19,7 +19,7 @@ ; CI: ; NumSgprs: 8 ; VI-NOXNACK: ; NumSgprs: 8 ; VI-XNACK: ; NumSgprs: 12 -define void @no_vcc_no_flat() { +define amdgpu_kernel void @no_vcc_no_flat() { entry: call void asm sideeffect "", "~{SGPR7}"() ret void @@ -33,7 +33,7 @@ entry: ; CI: ; NumSgprs: 10 ; VI-NOXNACK: ; NumSgprs: 10 ; VI-XNACK: ; NumSgprs: 12 -define void @vcc_no_flat() { +define amdgpu_kernel void @vcc_no_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC}"() ret void @@ -50,7 +50,7 @@ entry: ; HSA-CI: ; NumSgprs: 8 ; HSA-VI-NOXNACK: ; NumSgprs: 8 ; HSA-VI-XNACK: ; NumSgprs: 12 -define void @no_vcc_flat() { +define amdgpu_kernel void @no_vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"() ret void @@ -66,7 +66,7 @@ entry: ; HSA-CI: ; NumSgprs: 10 ; HSA-VI-NOXNACK: ; NumSgprs: 10 ; HSA-VI-XNACK: ; NumSgprs: 12 -define void @vcc_flat() { +define amdgpu_kernel void @vcc_flat() { entry: call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"() ret void diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll index 7c91471e30a..cc95d80570e 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i32_offset: ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst @@ -13,7 +13,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret_offset: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst @@ -23,7 +23,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -34,7 +34,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword 
v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -45,7 +45,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -54,7 +54,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -63,7 +63,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_addr64: ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -73,7 +73,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64: ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -83,7 +83,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_offset: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst @@ -93,7 +93,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret_offset: ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst @@ -103,7 +103,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 
addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -114,7 +114,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset: ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -125,7 +125,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -134,7 +134,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -143,7 +143,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_addr64: ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -153,7 +153,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64: ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -163,7 +163,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_offset: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst @@ -173,7 +173,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* 
%out, i32 4 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst @@ -183,7 +183,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -194,7 +194,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -205,7 +205,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -214,7 +214,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -223,7 +223,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_addr64: ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -233,7 +233,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64: ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -243,7 +243,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_offset: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst @@ -253,7 +253,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret_offset: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], 
v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst @@ -263,7 +263,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -274,7 +274,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -285,7 +285,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -294,7 +294,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -303,7 +303,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_addr64: ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -313,7 +313,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64: ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -323,7 +323,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_offset: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void 
@atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst @@ -333,7 +333,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst @@ -343,7 +343,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -354,7 +354,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -365,7 +365,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -374,7 +374,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -383,7 +383,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_addr64: ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -393,7 +393,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64: ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { 
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -403,7 +403,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_offset: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst @@ -413,7 +413,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret_offset: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst @@ -423,7 +423,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -434,7 +434,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -445,7 +445,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -454,7 +454,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -463,7 +463,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_addr64: ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw 
volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -473,7 +473,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64: ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -483,7 +483,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_offset: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst @@ -493,7 +493,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst @@ -503,7 +503,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -514,7 +514,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -525,7 +525,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -534,7 +534,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 
@@ -543,7 +543,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_addr64: ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -553,7 +553,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64: ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}} - define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { + define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -563,7 +563,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_offset: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst @@ -573,7 +573,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret_offset: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst @@ -583,7 +583,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -594,7 +594,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -605,7 +605,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -614,7 +614,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: 
flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -623,7 +623,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_addr64: ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -633,7 +633,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64: ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -643,7 +643,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_offset: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst @@ -653,7 +653,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst @@ -663,7 +663,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -674,7 +674,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -685,7 +685,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32: ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], 
v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -694,7 +694,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -703,7 +703,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64: ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -713,7 +713,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64: ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -725,7 +725,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -735,7 +735,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -746,7 +746,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -757,7 +757,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset: ; GCN: flat_atomic_cmpswap 
v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -769,7 +769,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -778,7 +778,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst %flag = extractvalue { i32, i1 } %val, 0 @@ -788,7 +788,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -798,7 +798,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64: ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -809,7 +809,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_offset: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst @@ -819,7 +819,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %gep = 
getelementptr i32, i32 addrspace(4)* %out, i32 4 %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst @@ -829,7 +829,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -840,7 +840,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -851,7 +851,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32: ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst ret void @@ -860,7 +860,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(4)* %out2 @@ -869,7 +869,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_addr64: ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -879,7 +879,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64: ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst @@ -890,7 +890,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32_offset: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4 %val = load atomic i32, i32 
addrspace(4)* %gep seq_cst, align 4 @@ -901,7 +901,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) { entry: %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4 store i32 %val, i32 addrspace(4)* %out @@ -911,7 +911,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -923,7 +923,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i32_addr64: ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4 @@ -933,7 +933,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_offset: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) { entry: %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4 store atomic i32 %in, i32 addrspace(4)* %gep seq_cst, align 4 @@ -942,7 +942,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) { entry: store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4 ret void @@ -950,7 +950,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 @@ -960,7 +960,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i32_addr64: ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index 0bd6c2dd5b8..723dde9ab68 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -3,7 
+3,7 @@ ; GCN-LABEL: {{^}}atomic_add_i64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst @@ -13,7 +13,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst @@ -23,7 +23,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -34,7 +34,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -45,7 +45,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -54,7 +54,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -63,7 +63,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -73,7 +73,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64: ; GCN: flat_atomic_add_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -83,7 +83,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst @@ -93,7 +93,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst @@ -103,7 +103,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -114,7 +114,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -125,7 +125,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -134,7 +134,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -143,7 +143,7 @@ entry: 
; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -153,7 +153,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64: ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -163,7 +163,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst @@ -173,7 +173,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst @@ -183,7 +183,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -194,7 +194,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -205,7 +205,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -214,7 +214,7 @@ entry: ; GCN-LABEL: 
{{^}}atomic_sub_i64_ret: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -223,7 +223,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -233,7 +233,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64: ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -243,7 +243,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst @@ -253,7 +253,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst @@ -263,7 +263,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -274,7 +274,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, 
i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -285,7 +285,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -294,7 +294,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -303,7 +303,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -313,7 +313,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64: ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -323,7 +323,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst @@ -333,7 +333,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst @@ -343,7 +343,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* 
%out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -354,7 +354,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -365,7 +365,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -374,7 +374,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -383,7 +383,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -393,7 +393,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64: ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -403,7 +403,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst @@ -413,7 +413,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, 
i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst @@ -423,7 +423,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -434,7 +434,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -445,7 +445,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -454,7 +454,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -463,7 +463,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -473,7 +473,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64: ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -483,7 +483,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: 
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -493,7 +493,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -503,7 +503,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -514,7 +514,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -525,7 +525,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
ret void
@@ -534,7 +534,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
entry:
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
store i64 %tmp0, i64 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
%tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -553,7 +553,7 @@ entry:
; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -563,7 +563,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst @@ -573,7 +573,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst @@ -583,7 +583,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -594,7 +594,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -605,7 +605,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -614,7 +614,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -623,7 +623,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, 
i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -633,7 +633,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64: ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -643,7 +643,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst @@ -653,7 +653,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst @@ -663,7 +663,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -674,7 +674,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -685,7 +685,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -694,7 +694,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 
addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -703,7 +703,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -713,7 +713,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64: ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -723,7 +723,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst @@ -733,7 +733,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst @@ -743,7 +743,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -754,7 +754,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -765,7 +765,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], 
v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst ret void @@ -774,7 +774,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(4)* %out2 @@ -783,7 +783,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -793,7 +793,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64: ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst @@ -804,7 +804,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4 %val = load atomic i64, i64 addrspace(4)* %gep seq_cst, align 8 @@ -815,7 +815,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) { entry: %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8 store i64 %val, i64 addrspace(4)* %out @@ -825,7 +825,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset: ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -837,7 +837,7 @@ entry: ; GCN-LABEL: {{^}}atomic_load_i64_addr64: ; GCN: flat_load_dwordx2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] -define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8 @@ -847,7 +847,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_offset: ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 store atomic i64 %in, i64 addrspace(4)* %gep seq_cst, align 8 @@ -856,7 +856,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc -define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) { entry: store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8 ret void @@ -864,7 +864,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -874,7 +874,7 @@ entry: ; GCN-LABEL: {{^}}atomic_store_i64_addr64: ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8 @@ -883,7 +883,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -892,7 +892,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -902,7 +902,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) 
{ +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -913,7 +913,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -924,7 +924,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4 @@ -936,7 +936,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst ret void @@ -945,7 +945,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -955,7 +955,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -965,7 +965,7 @@ entry: ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64: ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 
addrspace(4)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index 50c5a5abf7f..4113ba8dc1f 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -18,7 +18,7 @@ declare float @llvm.fma.f32(float, float, float) #0 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -46,7 +46,7 @@ define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addr ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -75,7 +75,7 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -99,7 +99,7 @@ define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addr ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -127,7 +127,7 @@ define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: 
buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -156,7 +156,7 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -184,7 +184,7 @@ define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -213,7 +213,7 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}} ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -242,7 +242,7 @@ define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double 
addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -276,7 +276,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}} ; SI: s_endpgm -define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -317,7 +317,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -358,7 +358,7 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias % ; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -390,7 +390,7 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias % ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load volatile float, float addrspace(1)* %in1 @@ -406,7 +406,7 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load volatile float, float addrspace(1)* %in1 @@ -422,7 +422,7 @@ define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, float 
addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -438,7 +438,7 @@ define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -454,7 +454,7 @@ define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -470,7 +470,7 @@ define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -486,7 +486,7 @@ define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -502,7 +502,7 @@ define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -518,7 +518,7 @@ define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -534,7 +534,7 @@ define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]] -define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -550,7 +550,7 @@ define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, 
[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -566,7 +566,7 @@ define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out, ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]] ; ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]] -define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) { %x = load float, float addrspace(1)* %in1 @@ -588,7 +588,7 @@ define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out, ; ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]] ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]] -define void @test_f32_interp(float addrspace(1)* %out, +define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { @@ -610,7 +610,7 @@ define void @test_f32_interp(float addrspace(1)* %out, ; ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]] ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]] -define void @test_f64_interp(double addrspace(1)* %out, +define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { diff --git a/llvm/test/CodeGen/AMDGPU/fma.f64.ll b/llvm/test/CodeGen/AMDGPU/fma.f64.ll index cf6d7d82499..4d3f3712621 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f64.ll @@ -8,7 +8,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n ; FUNC-LABEL: {{^}}fma_f64: ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -21,7 +21,7 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; FUNC-LABEL: {{^}}fma_v2f64: ; SI: v_fma_f64 ; SI: v_fma_f64 -define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -36,7 +36,7 @@ define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1 ; SI: v_fma_f64 ; SI: v_fma_f64 ; SI: v_fma_f64 -define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 diff --git a/llvm/test/CodeGen/AMDGPU/fma.ll b/llvm/test/CodeGen/AMDGPU/fma.ll 
index d04a5946b98..659cecb59eb 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.ll @@ -12,7 +12,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG: FMA {{\*? *}}[[RES]] -define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) { %r0 = load float, float addrspace(1)* %in1 %r1 = load float, float addrspace(1)* %in2 @@ -29,7 +29,7 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}}, ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]] ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]] -define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) { %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1 %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2 @@ -50,7 +50,7 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* ; EG-DAG: FMA {{\*? *}}[[RES]].Y ; EG-DAG: FMA {{\*? *}}[[RES]].Z ; EG-DAG: FMA {{\*? *}}[[RES]].W -define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, +define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) { %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1 %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2 @@ -62,7 +62,7 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32 ; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}} -define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -77,7 +77,7 @@ define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, fl } ; FUNC-LABEL: @fma_commute_mul_s_f32 -define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { +define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind { %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll index 4d42a4630e2..8b9104b79e7 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll @@ -11,7 +11,7 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone ; SI: v_max_f64 
[[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { +define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1 %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2 %a = load volatile double, double addrspace(1)* %aptr, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll index 7c01ca85f6b..a96eb5db9e2 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll @@ -10,7 +10,7 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 @@ -28,7 +28,7 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll index da498caa6b5..083346e9d1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f64.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: @test_fmax_legacy_uge_f64 -define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -19,7 +19,7 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmax_legacy_oge_f64 -define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -34,7 +34,7 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmax_legacy_ugt_f64 -define void @test_fmax_legacy_ugt_f64(double 
addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -49,7 +49,7 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmax_legacy_ogt_f64 -define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll index 4a4c92a38a3..7643c3ea533 100644 --- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.ll @@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -33,7 +33,7 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -53,7 +53,7 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -73,7 +73,7 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -93,7 +93,7 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace( ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: 
v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; EG: MAX -define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 @@ -114,7 +114,7 @@ define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x flo ; SI-NONAN: v_max_f32_e32 ; SI-NONAN: v_max_f32_e32 ; SI-NONAN: v_max_f32_e32 -define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 @@ -137,7 +137,7 @@ define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x flo ; SI-NOT: v_max_ ; EG: MAX -define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll index fec3a358a4f..20af278bf98 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.f64.ll @@ -9,7 +9,7 @@ declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0 ; FUNC-LABEL: @test_fmax_f64 ; SI: v_max_f64 -define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind { %val = call double @llvm.maxnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* %out, align 8 ret void @@ -18,7 +18,7 @@ define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) noun ; FUNC-LABEL: @test_fmax_v2f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 ret void @@ -29,7 +29,7 @@ define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, < ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void @@ -44,7 +44,7 @@ define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, < ; SI: v_max_f64 ; SI: v_max_f64 ; 
SI: v_max_f64 -define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void @@ -67,7 +67,7 @@ define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, < ; SI: v_max_f64 ; SI: v_max_f64 ; SI: v_max_f64 -define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll index 4058247a6da..277b8ce04c4 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll @@ -14,7 +14,7 @@ declare double @llvm.maxnum.f64(double, double) ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %val = call float @llvm.maxnum.f32(float %a, float %b) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwin ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0 store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void @@ -44,7 +44,7 @@ define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] ; EG: MAX_DX10 {{.*}}[[OUT]] -define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0 store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void @@ -70,7 +70,7 @@ define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W -define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0 store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void @@ -114,7 +114,7 @@ define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W 
-define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0 store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void @@ -128,7 +128,7 @@ define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -143,7 +143,7 @@ define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind { ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} ; EG: 2143289344(nan) -define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -157,7 +157,7 @@ define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -171,7 +171,7 @@ define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -185,7 +185,7 @@ define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -199,7 +199,7 @@ define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -213,7 +213,7 @@ define void 
@constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -227,7 +227,7 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MAX_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -250,7 +250,7 @@ define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -262,7 +262,7 @@ define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -274,7 +274,7 @@ define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll index 5a651c73766..d2cfc713ed3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32: ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}} ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 -define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* 
%out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -27,7 +27,7 @@ define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float a ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -45,7 +45,7 @@ define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1) ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -63,7 +63,7 @@ define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float add ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -79,7 +79,7 @@ define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float add ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -96,7 +96,7 @@ define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, flo ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32: ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -113,7 +113,7 @@ define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float ad ; GCN-LABEL: 
{{^}}v_test_fmed3_r_i_i_f64: ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0 -define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid @@ -128,7 +128,7 @@ define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace( ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32: ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0 -define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { +define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -146,7 +146,7 @@ define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addr ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}} ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} -define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -169,7 +169,7 @@ define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrs ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -192,7 +192,7 @@ define void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -215,7 +215,7 @@ define void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], 
-[[C]] -define void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -238,7 +238,7 @@ define void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]| -define void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -267,7 +267,7 @@ define void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %ou ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]| -define void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -301,7 +301,7 @@ define void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %ou ; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]] ; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] -define void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -341,7 +341,7 @@ define void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspa ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* 
%aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -363,7 +363,7 @@ define void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -385,7 +385,7 @@ define void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -407,7 +407,7 @@ define void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -429,7 +429,7 @@ define void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -451,7 +451,7 @@ define void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void 
@v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -473,7 +473,7 @@ define void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -495,7 +495,7 @@ define void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -517,7 +517,7 @@ define void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -539,7 +539,7 @@ define void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -561,7 +561,7 @@ define void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float a ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword 
[[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -583,7 +583,7 @@ define void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -605,7 +605,7 @@ define void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -627,7 +627,7 @@ define void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -649,7 +649,7 @@ define void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]] -define void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid 
%gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -671,7 +671,7 @@ define void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float ; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]] ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]] -define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -697,7 +697,7 @@ define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float ; GCN-DAG: v_max_f32 ; GCN: v_min_f32 ; GCN: v_max_f32 -define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -716,7 +716,7 @@ define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, floa } ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1: -define void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -735,7 +735,7 @@ define void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, floa } ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2: -define void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -755,7 +755,7 @@ define void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, floa ; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0: -define void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -773,7 +773,7 @@ define void @v_test_safe_med3_f32_pat0(float 
addrspace(1)* %out, float addrspace } ; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0: -define void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -796,7 +796,7 @@ define void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, floa } ; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0: -define void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -819,7 +819,7 @@ define void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, floa } ; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0: -define void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -849,7 +849,7 @@ define void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, floa ; GCN: v_max_f32 ; GCN: v_min_f32 ; GCN: v_max_f32 -define void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -874,7 +874,7 @@ define void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace( ; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]] ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]] ; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]] -define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { +define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid @@ -901,7 +901,7 @@ define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float add ; GFX9: v_add_f16_e32 v{{[0-9]+}}, 1.0 ; GFX9: 
v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0 -define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -938,7 +938,7 @@ define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half add ; VI: v_max_f16 ; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]] -define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { +define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll index 3102ffdbdd2..3183f77f090 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll @@ -11,7 +11,7 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 @@ -29,7 +29,7 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind { %a = load volatile float, float addrspace(1)* %aptr, align 4 %b = load volatile float, float addrspace(1)* %bptr, align 4 %c = load volatile float, float addrspace(1)* %cptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll index 6982ee0c0cb..99bc114831c 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f64.ll @@ -3,7 +3,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; FUNC-LABEL: @test_fmin_legacy_f64 -define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { +define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 { %r0 = extractelement <4 x double> %reg0, i32 0 %r1 = extractelement <4 x double> %reg0, i32 1 %r2 = fcmp uge double %r0, %r1 @@ -14,7 +14,7 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> } ; FUNC-LABEL: @test_fmin_legacy_ule_f64 -define void 
@test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -29,7 +29,7 @@ define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmin_legacy_ole_f64 -define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -44,7 +44,7 @@ define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmin_legacy_olt_f64 -define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -59,7 +59,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac } ; FUNC-LABEL: @test_fmin_legacy_ult_f64 -define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll index 79acd02e6d1..52336f95a90 100644 --- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.ll @@ -14,7 +14,7 @@ declare i32 @llvm.r600.read.tidig.x() #1 ; EG: MIN * ; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { +define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp uge float %r0, %r1 @@ -34,7 +34,7 @@ define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]] -define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 { %cmp = fcmp ule float %a, %b %val = select i1 %cmp, float %a, float %b store float %val, float addrspace(1)* %out, align 4 @@ -46,7 +46,7 @@ define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, floa ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; 
SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -65,7 +65,7 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -84,7 +84,7 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -103,7 +103,7 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -122,7 +122,7 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace( ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]] ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]] -define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1 @@ -144,7 +144,7 @@ define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x flo ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1 @@ -166,7 +166,7 @@ define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x flo ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 ; SI-NONAN: v_min_f32_e32 -define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1 @@ -188,7 +188,7 @@ define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x flo ; SI-NEXT: v_cndmask_b32 ; SI-NOT: v_min ; SI: s_endpgm -define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() #1 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll index 0f929d6a81f..01b26741121 100644 --- a/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fminnum.f64.ll @@ -9,7 +9,7 @@ declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0 ; FUNC-LABEL: @test_fmin_f64 ; SI: v_min_f64 -define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind { %val = call double @llvm.minnum.f64(double %a, double %b) #0 store double %val, double addrspace(1)* %out, align 8 ret void @@ -18,7 +18,7 @@ define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) noun ; FUNC-LABEL: @test_fmin_v2f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind { %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0 store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16 ret void @@ -29,7 +29,7 @@ define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, < ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind { %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0 store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32 ret void @@ -44,7 +44,7 @@ define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, < ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { +define amdgpu_kernel void 
@test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind { %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0 store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64 ret void @@ -67,7 +67,7 @@ define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, < ; SI: v_min_f64 ; SI: v_min_f64 ; SI: v_min_f64 -define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { +define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind { %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0 store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll index abd2b9d3e4d..9e997c7a104 100644 --- a/llvm/test/CodeGen/AMDGPU/fminnum.ll +++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll @@ -13,7 +13,7 @@ declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %val = call float @llvm.minnum.f32(float %a, float %b) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwin ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]] ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind { %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0 store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8 ret void @@ -43,7 +43,7 @@ define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] ; EG: MIN_DX10 {{.*}}[[OUT]] -define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind { %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0 store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16 ret void @@ -69,7 +69,7 @@ define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W -define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { +define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind { %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0 store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32 ret void @@ -113,7 +113,7 @@ define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W -define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { +define amdgpu_kernel 
void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind { %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0 store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64 ret void @@ -127,7 +127,7 @@ define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -142,7 +142,7 @@ define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind { ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} ; EG: 2143289344({{nan|1\.#QNAN0e\+00}}) -define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -156,7 +156,7 @@ define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -170,7 +170,7 @@ define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -198,7 +198,7 @@ define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -212,7 +212,7 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: 
MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -226,7 +226,7 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG-NOT: MIN_DX10 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}} -define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -237,7 +237,7 @@ define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float %a, float 2.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -248,7 +248,7 @@ define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float 2.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -260,7 +260,7 @@ define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float %a, float 99.0) #0 store float %val, float addrspace(1)* %out, align 4 ret void @@ -272,7 +272,7 @@ define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind { ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]] ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}} -define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { +define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind { %val = call float @llvm.minnum.f32(float 99.0, float %a) #0 store float %val, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll index 553ab18d9f7..4002712ab16 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -23,7 +23,7 @@ declare float @llvm.fabs.f32(float) #1 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}| ; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 -define void @multiple_fadd_use_test_f32(float 
addrspace(1)* %out, float %x, float %y, float %z) #0 { +define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %a11 = fadd fast float %y, -1.0 %a12 = call float @llvm.fabs.f32(float %a11) %a13 = fadd fast float %x, -1.0 @@ -44,7 +44,7 @@ define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, floa ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %mad = fadd fast float %mul2, %y @@ -59,7 +59,7 @@ define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, floa ; GCN-DAG: buffer_store_dword [[MUL2]] ; GCN-DAG: buffer_store_dword [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 @@ -72,7 +72,7 @@ define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, floa ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32: ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} -define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { +define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %x.abs = call float @llvm.fabs.f32(float %x) %mul2 = fmul fast float %x.abs, 2.0 @@ -87,7 +87,7 @@ define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] -define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %muln2 = fmul fast float %x, -2.0 @@ -101,7 +101,7 @@ define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 { ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_dword [[RESULT]] -define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %mul2 = fmul fast float %x, 2.0 %muln2 = fmul fast float %x, -3.0 @@ -119,7 +119,7 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 { ; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 ; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0 -define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { +define amdgpu_kernel 
void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half @@ -146,7 +146,7 @@ define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x. ; GCN-DAG: buffer_store_short [[MUL2]] ; GCN-DAG: buffer_store_short [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -166,7 +166,7 @@ define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x. ; GCN-DAG: buffer_store_short [[MUL2]] ; GCN-DAG: buffer_store_short [[MAD]] ; GCN: s_endpgm -define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -185,7 +185,7 @@ define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x. ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}} ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}} -define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { +define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %z = bitcast i16 %z.arg to half @@ -203,7 +203,7 @@ define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroe ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] -define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 @@ -219,7 +219,7 @@ define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 ze ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]] ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]] ; GCN: buffer_store_short [[RESULT]] -define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { +define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 { %x = bitcast i16 %x.arg to half %y = bitcast i16 %y.arg to half %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 3ccd09b67fa..75977ada77b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -11,7 +11,7 @@ ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16( +define amdgpu_kernel void @fmul_f16( 
half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -31,7 +31,7 @@ entry: ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16_imm_a( +define amdgpu_kernel void @fmul_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -50,7 +50,7 @@ entry: ; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fmul_f16_imm_b( +define amdgpu_kernel void @fmul_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -83,7 +83,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16( +define amdgpu_kernel void @fmul_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -110,7 +110,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16_imm_a( +define amdgpu_kernel void @fmul_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -135,7 +135,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmul_v2f16_imm_b( +define amdgpu_kernel void @fmul_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fmul.ll b/llvm/test/CodeGen/AMDGPU/fmul.ll index 8228fd0a506..125de7aabfd 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.ll @@ -6,7 +6,7 @@ ; GCN: v_mul_f32 ; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) { entry: %0 = fmul float %a, %b store float %0, float addrspace(1)* %out @@ -19,7 +19,7 @@ entry: ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}} -define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { entry: %0 = fmul <2 x float> %a, %b store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -36,7 +36,7 @@ entry: ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1) * %in %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr @@ -49,7 +49,7 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ; GCN: v_mul_f32 ; GCN-NOT: v_mul_f32 ; GCN: s_endpgm -define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { %y = fmul float %x, 2.0 %z = fmul float %y, 3.0 store float %z, float addrspace(1)* %out @@ -61,7 +61,7 @@ define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 { ; GCN-NOT: v_mul_f32 ; GCN-NOT: v_mad_f32 ; GCN: s_endpgm 
-define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { %y = fmul float %x, 3.0 %z = fmul float %y, 2.0 store float %z, float addrspace(1)* %out @@ -75,7 +75,7 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 { ; GCN: v_mul_f32 ; GCN: v_mul_f32 ; GCN-NOT: v_mul_f32 -define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 { %a = fmul float %x, 5.0 %b = fsub float -0.0, %a %c = fmul float %b, %y diff --git a/llvm/test/CodeGen/AMDGPU/fmul64.ll b/llvm/test/CodeGen/AMDGPU/fmul64.ll index 3c222eaba89..f14233f267b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fmul_f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -15,7 +15,7 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; FUNC-LABEL: {{^}}fmul_v2f64: ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) { %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1 %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2 @@ -29,7 +29,7 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace( ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, +define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1, <4 x double> addrspace(1)* %in2) { %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1 %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll index 477ae3da507..9b713419e74 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #1 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2, half addrspace(1)* %in3) #0 { %r0 = load half, half addrspace(1)* %in1 %r1 = load half, half addrspace(1)* %in2 @@ -34,7 +34,7 @@ define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1, ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* 
%in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -56,7 +56,7 @@ define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -82,7 +82,7 @@ define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f16(half addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -111,7 +111,7 @@ define void @fadd_a_a_b_f16(half addrspace(1)* %out, ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f16(half addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in1, half addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -134,7 +134,7 @@ define void @fadd_b_a_a_f16(half addrspace(1)* %out, ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]] ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -156,7 +156,7 @@ define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -180,7 +180,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* 
%gep.0, i32 1 @@ -202,7 +202,7 @@ define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -231,7 +231,7 @@ define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -261,7 +261,7 @@ define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspa ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -291,7 +291,7 @@ define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half add ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -323,7 +323,7 @@ define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half ad ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -355,7 +355,7 @@ define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, hal ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]] ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] ; 
VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -388,7 +388,7 @@ define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half add ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext @@ -419,7 +419,7 @@ define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half ad ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 @@ -447,7 +447,7 @@ define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll index e4b1053ff25..fb605dd2e4b 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -25,7 +25,7 @@ declare float @llvm.fabs.f32(float) #1 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { %r0 = load float, float addrspace(1)* %in1 %r1 = load float, float addrspace(1)* %in2 @@ -45,7 +45,7 @@ define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; GCN-DENORM-STRICT: v_mul_f32_e32 ; GCN-DENORM-STRICT: v_add_f32_e32 -define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2, float addrspace(1)* %in3) #0 { %r0 = load volatile float, float addrspace(1)* %in1 %r1 = load volatile float, float addrspace(1)* %in2 
@@ -71,7 +71,7 @@ define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; SI-DENORM buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -100,7 +100,7 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* % ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -132,7 +132,7 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* % ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f32(float addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -167,7 +167,7 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out, ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f32(float addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -196,7 +196,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out, ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -225,7 +225,7 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1 ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -256,7 +256,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { 
%tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -286,7 +286,7 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1 ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -318,7 +318,7 @@ define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1 ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -353,7 +353,7 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -387,7 +387,7 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -422,7 +422,7 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -460,7 +460,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float 
addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -496,7 +496,7 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { +define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext @@ -532,7 +532,7 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float ; SI-DENORM: buffer_store_dword [[RESULT]] ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -563,7 +563,7 @@ define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* % ; SI: buffer_store_dword [[RESULT]] ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll index f5e64b3c594..86e91e04b0f 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.f64.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}fmuladd_f64: ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -22,7 +22,7 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} ; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2, double addrspace(1)* %in3) #0 { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -44,7 +44,7 @@ define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI: 
buffer_store_dwordx2 [[RESULT]] ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_f64(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -72,7 +72,7 @@ define void @fadd_a_a_b_f64(double addrspace(1)* %out, ; SI: buffer_store_dwordx2 [[RESULT]] ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_b_a_a_f64(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -94,7 +94,7 @@ define void @fadd_b_a_a_f64(double addrspace(1)* %out, ; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} ; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}} -define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { +define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext @@ -117,7 +117,7 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add ; GCN-STRICT: v_add_f64 ; GCN-CONTRACT: v_fma_f64 -define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -139,7 +139,7 @@ define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out, ; GCN-STRICT: v_add_f64 ; GCN-CONTRACT: v_fma_f64 -define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -158,7 +158,7 @@ define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out, ; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast: ; GCN: v_fma_f64 -define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll index b0123b86b34..bdd3c04fd31 100644 --- a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -17,7 +17,7 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 ; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, +define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 @@ -37,7 +37,7 @@ define void @fmuladd_v2f16(<2 x half> addrspace(1)* 
%out, <2 x half> addrspace(1 ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 @@ -61,7 +61,7 @@ define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> add ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 @@ -86,7 +86,7 @@ define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> add ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, +define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, <2 x half> addrspace(1)* %in2) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll index 5423fadf81e..4ff3bbbcbc3 100644 --- a/llvm/test/CodeGen/AMDGPU/fnearbyint.ll +++ b/llvm/test/CodeGen/AMDGPU/fnearbyint.ll @@ -13,41 +13,41 @@ declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0 -define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 { entry: %0 = call float @llvm.nearbyint.f32(float %in) store float %0, float addrspace(1)* %out ret void } -define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out ret void } -define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out ret void } -define void @nearbyint_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) { entry: %0 = call double @llvm.nearbyint.f64(double %in) store double %0, double addrspace(1)* %out ret void } -define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { entry: %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in) store <2 x double> %0, <2 x double> addrspace(1)* %out ret void } -define void 
@nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { entry: %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in) store <4 x double> %0, <4 x double> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index acd4f7ee4a8..1c0e9a2f13c 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -14,7 +14,7 @@ ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -35,7 +35,7 @@ define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -62,7 +62,7 @@ define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrsp ; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[ADD]] ; GCN: buffer_store_dword [[NEG_ADD]] ; GCN-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -87,7 +87,7 @@ define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrsp ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -111,7 +111,7 @@ define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -135,7 +135,7 @@ define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -164,7 +164,7 @@ define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace( ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]] -define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -192,7 +192,7 @@ define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]] ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]] -define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -218,7 +218,7 @@ define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -239,7 +239,7 @@ define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[ADD]] -define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -261,7 +261,7 @@ define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrsp ; GCN-NEXT: 
v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -282,7 +282,7 @@ define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrsp ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -302,7 +302,7 @@ define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -322,7 +322,7 @@ define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] ; GCN-NEXT: buffer_store_dword [[ADD]] -define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -345,7 +345,7 @@ define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace( ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]] ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[NEG_A]] -define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -368,7 +368,7 @@ define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} ; GCN-NEXT: buffer_store_dword [[NEG_MUL]] ; GCN: buffer_store_dword [[MUL]] -define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float 
addrspace(1)* %b.ptr, float %c) #0 { +define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -394,7 +394,7 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -412,7 +412,7 @@ define void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a. ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -428,7 +428,7 @@ define void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -444,7 +444,7 @@ define void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -460,7 +460,7 @@ define void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -476,7 +476,7 @@ define void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* % ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] 
; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -494,7 +494,7 @@ define void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -516,7 +516,7 @@ define void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float ad ; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]] ; GCN-NEXT: buffer_store_dword [[MAX0]] ; GCN-NEXT: buffer_store_dword [[MUL1]] -define void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -541,7 +541,7 @@ define void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -559,7 +559,7 @@ define void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a. 
; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -575,7 +575,7 @@ define void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -591,7 +591,7 @@ define void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -607,7 +607,7 @@ define void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -623,7 +623,7 @@ define void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* % ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0 ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -641,7 +641,7 @@ define void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1) ; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]] ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]] ; GCN: buffer_store_dword [[RESULT]] -define void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float 
addrspace(1)* %a.ptr, i64 %tid.ext
  (the remaining hunks in this file all make the same one-line change, define void -> define amdgpu_kernel void, to: @v_fneg_maxnum_multi_use_maxnum_f32, @v_fneg_fma_f32, @v_fneg_fma_store_use_fma_f32, @v_fneg_fma_multi_use_fma_f32, @v_fneg_fma_fneg_x_y_f32, @v_fneg_fma_x_fneg_y_f32, @v_fneg_fma_fneg_fneg_y_f32, @v_fneg_fma_fneg_x_fneg_f32, @v_fneg_fma_x_y_fneg_f32, @v_fneg_fma_store_use_fneg_x_y_f32, @v_fneg_fma_multi_use_fneg_x_y_f32, @v_fneg_fmad_f32, @v_fneg_fmad_multi_use_fmad_f32, @v_fneg_fp_extend_f32_to_f64, @v_fneg_fp_extend_fneg_f32_to_f64, @v_fneg_fp_extend_store_use_fneg_f32_to_f64, @v_fneg_multi_use_fp_extend_fneg_f32_to_f64, @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64, @v_fneg_multi_use_fp_extend_fneg_f16_to_f32, @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32, @v_fneg_fp_round_f64_to_f32, @v_fneg_fp_round_fneg_f64_to_f32, @v_fneg_fp_round_store_use_fneg_f64_to_f32, @v_fneg_fp_round_multi_use_fneg_f64_to_f32, @v_fneg_fp_round_f32_to_f16, @v_fneg_fp_round_fneg_f32_to_f16, @v_fneg_multi_use_fp_round_fneg_f64_to_f32, @v_fneg_fp_round_store_use_fneg_f32_to_f16, @v_fneg_fp_round_multi_use_fneg_f32_to_f16, @v_fneg_rcp_f32, @v_fneg_rcp_fneg_f32, @v_fneg_rcp_store_use_fneg_f32, @v_fneg_rcp_multi_use_fneg_f32, @v_fneg_rcp_legacy_f32, @v_fneg_mul_legacy_f32, @v_fneg_mul_legacy_store_use_mul_legacy_f32, @v_fneg_mul_legacy_multi_use_mul_legacy_f32, @v_fneg_mul_legacy_fneg_x_f32, @v_fneg_mul_legacy_x_fneg_f32, @v_fneg_mul_legacy_fneg_fneg_f32, @v_fneg_mul_legacy_store_use_fneg_x_f32, @v_fneg_mul_legacy_multi_use_fneg_x_f32, @v_fneg_sin_f32, @v_fneg_amdgcn_sin_f32, @v_fneg_trunc_f32, @v_fneg_round_f32, @v_fneg_rint_f32, @v_fneg_nearbyint_f32, @v_fneg_interp_p1_f32, @v_fneg_interp_p2_f32, @v_fneg_copytoreg_f32, @v_fneg_inlineasm_f32, @v_fneg_inlineasm_multi_use_src_f32, @multiuse_fneg_2_vop3_users_f32, @multiuse_fneg_2_vop2_users_f32, @multiuse_fneg_vop2_vop3_users_f32, @free_fold_src_code_size_cost_use_f32, @free_fold_src_code_size_cost_use_f64, @one_use_cost_to_fold_into_src_f32, and @multi_use_cost_to_fold_into_src)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index beeda4cb940..555764c1551 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
  (define void -> define amdgpu_kernel void for: @fneg_fabs_fadd_f16, @fneg_fabs_fmul_f16, @fneg_fabs_free_f16, @fneg_fabs_f16, @v_fneg_fabs_f16, @s_fneg_fabs_v2f16, @fneg_fabs_v4f16, @fold_user_fneg_fabs_v2f16, @s_fneg_multi_use_fabs_v2f16, @s_fneg_multi_use_fabs_foldable_neg_v2f16)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index d16e83fd4d5..85f54403217 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
  (define void -> define amdgpu_kernel void for: @fneg_fabs_fadd_f64, @v_fneg_fabs_fadd_f64, @fneg_fabs_fmul_f64, @fneg_fabs_free_f64, @fneg_fabs_fn_free_f64, @fneg_fabs_f64, @fneg_fabs_v2f64, @fneg_fabs_v4f64)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index 9ee1171306c..a0cf37b159d 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
  (define void -> define amdgpu_kernel void for: @fneg_fabs_fadd_f32, @fneg_fabs_fmul_f32, @fneg_fabs_free_f32, @fneg_fabs_fn_free_f32, @fneg_fabs_f32, @v_fneg_fabs_f32, @fneg_fabs_v2f32, @fneg_fabs_v4f32)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index d0c6d3d5155..07a23cb7e16 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
  (define void -> define amdgpu_kernel void for: @s_fneg_f16, @v_fneg_f16, @fneg_free_f16, @v_fneg_fold_f16, @s_fneg_v2f16, @v_fneg_v2f16, @fneg_free_v2f16, @v_fneg_fold_v2f16)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
index b7080f4622a..9b4b4d6e942 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f64.ll
  (define void -> define amdgpu_kernel void for: @fneg_f64, @fneg_v2f64, @fneg_v4f64, @fneg_free_f64, @fneg_fold_f64)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index 007c6dcadd9..d1eabfb13c9 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
  (define void -> define amdgpu_kernel void for: @s_fneg_f32, @s_fneg_v2f32, @s_fneg_v4f32, @fsub0_f32, @fneg_free_f32, @fneg_fold_f32)
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index b7ffaed70c5..cbc42979f2e 100644
a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -9,7 +9,7 @@ declare double @llvm.fabs.f64(double) #1 ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] ; SI-NOT: v_cmp ; SI: s_endpgm -define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -20,7 +20,7 @@ define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 { ; SI-LABEL: {{^}}test_not_isinf_pattern_0: ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp ueq float %fabs, 0x7FF0000000000000 %ext = zext i1 %cmp to i32 @@ -31,7 +31,7 @@ define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x ; SI-LABEL: {{^}}test_not_isinf_pattern_1: ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { %fabs = tail call float @llvm.fabs.f32(float %x) #1 %cmp = fcmp oeq float %fabs, 0xFFF0000000000000 %ext = zext i1 %cmp to i32 @@ -45,7 +45,7 @@ define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]] ; SI-NOT: v_cmp ; SI: s_endpgm -define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -59,7 +59,7 @@ define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) ; SI-LABEL: {{^}}test_isfinite_not_pattern_0: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0xFFF0000000000000 @@ -73,7 +73,7 @@ define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_1: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %ninf = fcmp une float %x, 0x7FF0000000000000 %and = and i1 %ord, %ninf @@ -86,7 +86,7 @@ define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_2: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 
%ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -100,7 +100,7 @@ define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_3: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp uno float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp une float %x.fabs, 0x7FF0000000000000 @@ -114,7 +114,7 @@ define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float ; SI-LABEL: {{^}}test_isfinite_not_pattern_4: ; SI-NOT: v_cmp_class_f32 ; SI: s_endpgm -define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { +define amdgpu_kernel void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 { %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 %ninf = fcmp one float %x.fabs, 0x7FF0000000000000 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 01bc53ff35a..ce041364b76 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -14,7 +14,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone ; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]] ; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]] ; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]] -define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { %val = load i16, i16 addrspace(1)* %in, align 2 %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone store float %cvt, float addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index a9f493bf0cc..70f0c0c1afd 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone ; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]] ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind { %val = load i16, i16 addrspace(1)* %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, double addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index 3e426e3e94b..2c6b1cb18f7 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -12,7 +12,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone ; EG: MEM_RAT MSKOR ; EG: VTX_READ_32 ; EG: FLT32_TO_FLT16 -define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 
4 %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone store i16 %cvt, i16 addrspace(1)* %out, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll index 1537d67cadc..a7cddd09b76 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.f64.ll @@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1 ; FUNC-LABEL: @fp_to_sint_f64_i32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { %result = fptosi double %in to i32 store i32 %result, i32 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) { ; FUNC-LABEL: @fp_to_sint_v2f64_v2i32 ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) { %result = fptosi <2 x double> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -26,7 +26,7 @@ define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> % ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 ; SI: v_cvt_i32_f64_e32 -define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) { %result = fptosi <4 x double> %in to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void @@ -47,7 +47,7 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> % ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %val = load double, double addrspace(1)* %gep, align 8 @@ -58,7 +58,7 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in ; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}} -define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %conv = fptosi double %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { ; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}| -define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %in.fabs = call double @llvm.fabs.f64(double %in) %conv = fptosi double %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll index a2fa7a19074..630a7186e10 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_sint.ll @@ -8,7 +8,7 @@ declare float @llvm.fabs.f32(float) #1 ; EG: FLT_TO_INT {{\** 
*}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ; SI: v_cvt_i32_f32_e32 ; SI: s_endpgm -define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { %conv = fptosi float %in to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs: ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}} -define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i32 store i32 %conv, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) { ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}} ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptosi <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 ; SI: v_cvt_i32_f32_e32 -define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float>, <4 x float> addrspace(1) * %in %result = fptosi <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -76,7 +76,7 @@ define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac ; Check that the compiler doesn't crash with a "cannot select" error ; SI: s_endpgm -define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) { entry: %0 = fptosi float %in to i64 store i64 %0, i64 addrspace(1)* %out @@ -128,7 +128,7 @@ entry: ; EG-DAG: CNDE_INT ; SI: s_endpgm -define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { %conv = fptosi <2 x float> %x to <2 x i64> store <2 x i64> %conv, <2 x i64> addrspace(1)* %out ret void @@ -221,7 +221,7 @@ define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { ; EG-DAG: CNDE_INT ; SI: s_endpgm -define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { %conv = fptosi <4 x float> %x to <4 x i64> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void @@ -233,7 +233,7 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { ; EG: AND_INT ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y, ; EG-NEXT: -1082130432(-1.000000e+00) -define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %conv = fptosi float %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -241,7 +241,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: ; SI: v_cmp_eq_f32_e64 
s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}| -define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptosi float %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out @@ -251,7 +251,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; FUNC-LABEL: {{^}}fp_to_sint_f32_i16: ; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] -define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 { %sint = fptosi float %in to i16 store i16 %sint, i16 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll index d5bc416434d..4f597eb3f32 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.f64.ll @@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1 ; SI-LABEL: {{^}}fp_to_uint_i32_f64: ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { %cast = fptoui double %in to i32 store i32 %cast, i32 addrspace(1)* %out, align 4 ret void @@ -15,7 +15,7 @@ define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) { ; SI-LABEL: @fp_to_uint_v2i32_v2f64 ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) { %cast = fptoui <2 x double> %in to <2 x i32> store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -26,7 +26,7 @@ define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> % ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 ; SI: v_cvt_u32_f64_e32 -define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) { %cast = fptoui <4 x double> %in to <4 x i32> store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8 ret void @@ -47,7 +47,7 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> % ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]] ; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]] ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %val = load double, double addrspace(1)* %gep, align 8 @@ -57,14 +57,14 @@ define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in } ; SI-LABEL: @fp_to_uint_v2i64_v2f64 -define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) { %cast = fptoui <2 x double> %in to <2 x i64> store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16 ret void } ; SI-LABEL: @fp_to_uint_v4i64_v4f64 -define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { 
+define amdgpu_kernel void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) { %cast = fptoui <4 x double> %in to <4 x i64> store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32 ret void @@ -72,7 +72,7 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> % ; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}} -define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %conv = fptoui double %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -80,7 +80,7 @@ define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1: ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}| -define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 { %in.fabs = call double @llvm.fabs.f64(double %in) %conv = fptoui double %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll index cbff9f22b07..fdb15801dc4 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll +++ b/llvm/test/CodeGen/AMDGPU/fp_to_uint.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) #1 ; GCN: v_cvt_u32_f32_e32 ; GCN: s_endpgm -define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { %conv = fptoui float %in to i32 store i32 %conv, i32 addrspace(1)* %out ret void @@ -21,7 +21,7 @@ define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) { ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 -define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) { %result = fptoui <2 x float> %in to <2 x i32> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void @@ -37,7 +37,7 @@ define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> ; GCN: v_cvt_u32_f32_e32 ; GCN: v_cvt_u32_f32_e32 -define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %value = load <4 x float>, <4 x float> addrspace(1) * %in %result = fptoui <4 x float> %value to <4 x i32> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -68,7 +68,7 @@ define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { +define amdgpu_kernel void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { %conv = fptoui float %x to i64 store i64 %conv, i64 addrspace(1)* %out ret void @@ -119,7 +119,7 @@ define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) { ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) { %conv = fptoui <2 x float> %x to <2 x i64> store <2 x i64> %conv, <2 x i64> addrspace(1)* %out ret void @@ -212,7 +212,7 @@ define void 
@fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> ; EG-DAG: CNDE_INT ; GCN: s_endpgm -define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) { %conv = fptoui <4 x float> %x to <4 x i64> store <4 x i64> %conv, <4 x i64> addrspace(1)* %out ret void @@ -224,7 +224,7 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> ; EG: AND_INT ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0, -define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %conv = fptoui float %in to i1 store i1 %conv, i1 addrspace(1)* %out ret void @@ -232,7 +232,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1: ; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}| -define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { %in.fabs = call float @llvm.fabs.f32(float %in) %conv = fptoui float %in.fabs to i1 store i1 %conv, i1 addrspace(1)* %out @@ -246,7 +246,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 { ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_short [[VAL]] -define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 { %uint = fptoui float %in to i16 store i16 %uint, i16 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 1f0cc1511d4..4b03ada925e 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_dword v[[R_F32]] ; GCN: s_endpgm -define void @fpext_f16_to_f32( +define amdgpu_kernel void @fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -23,7 +23,7 @@ entry: ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}} ; GCN: s_endpgm -define void @fpext_f16_to_f64( +define amdgpu_kernel void @fpext_f16_to_f64( double addrspace(1)* %r, half addrspace(1)* %a) #0 { entry: @@ -41,7 +41,7 @@ entry: ; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm -define void @fpext_v2f16_to_v2f32( +define amdgpu_kernel void @fpext_v2f16_to_v2f32( <2 x float> addrspace(1)* %r, <2 x half> addrspace(1)* %a) #0 { entry: @@ -61,7 +61,7 @@ entry: ; GCN: v_cvt_f64_f32_e32 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @fpext_v2f16_to_v2f64( +define amdgpu_kernel void @fpext_v2f16_to_v2f64( <2 x double> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -73,7 +73,7 @@ entry: ; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32: ; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}} -define void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) { +define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) { entry: %a.trunc = trunc i32 %a to i16 %a.val = bitcast i16 %a.trunc to half @@ -85,7 +85,7 @@ 
entry: ; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]] -define void @fneg_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -99,7 +99,7 @@ entry: ; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]| -define void @fabs_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -113,7 +113,7 @@ entry: ; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32: ; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]] ; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]| -define void @fneg_fabs_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -135,7 +135,7 @@ entry: ; GCN: store_dword [[CVT]] ; GCN: store_short [[XOR]] -define void @fneg_multi_use_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -159,7 +159,7 @@ entry: ; GCN: buffer_store_dword [[CVTA_NEG]] ; GCN: buffer_store_short [[MUL]] -define void @fneg_multi_foldable_use_fpext_f16_to_f32( +define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -181,7 +181,7 @@ entry: ; GCN: store_dword [[CVT]] ; GCN: store_short [[XOR]] -define void @fabs_multi_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -205,7 +205,7 @@ entry: ; GCN: buffer_store_dword [[ABS_A]] ; GCN: buffer_store_short [[MUL]] -define void @fabs_multi_foldable_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -227,7 +227,7 @@ entry: ; GCN: buffer_store_dword [[CVT]] ; GCN: buffer_store_short [[OR]] -define void @fabs_fneg_multi_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -252,7 +252,7 @@ entry: ; GCN: buffer_store_dword [[FABS_FNEG]] ; GCN: buffer_store_short [[MUL]] -define void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( +define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( float addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fpext.ll b/llvm/test/CodeGen/AMDGPU/fpext.ll index 6dc84b01d73..b11e2ea056c 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}fpext_f32_to_f64: ; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { %result = fpext float %in to double store double %result, double addrspace(1)* %out ret void @@ -12,7 +12,7 @@ define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) { ; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64: ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) { %result = fpext <2 x float> %in to <2 x double> store <2 x double> %result, <2 x double> 
addrspace(1)* %out ret void @@ -22,7 +22,7 @@ define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> % ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) { +define amdgpu_kernel void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) { %result = fpext <3 x float> %in to <3 x double> store <3 x double> %result, <3 x double> addrspace(1)* %out ret void @@ -33,7 +33,7 @@ define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> % ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) { %result = fpext <4 x float> %in to <4 x double> store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> % ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 ; SI: v_cvt_f64_f32_e32 -define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { +define amdgpu_kernel void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) { %result = fpext <8 x float> %in to <8 x double> store <8 x double> %result, <8 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 71f56d730e9..2c25a221a66 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_I16]] ; GCN: s_endpgm -define void @fptosi_f16_to_i16( +define amdgpu_kernel void @fptosi_f16_to_i16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -23,7 +23,7 @@ entry: ; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fptosi_f16_to_i32( +define amdgpu_kernel void @fptosi_f16_to_i32( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -40,7 +40,7 @@ entry: ; GCN: buffer_load_ushort ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptosi_f16_to_i64( +define amdgpu_kernel void @fptosi_f16_to_i64( i64 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -62,7 +62,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]] ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i16( +define amdgpu_kernel void @fptosi_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -80,7 +80,7 @@ entry: ; GCN: v_cvt_i32_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i32( +define amdgpu_kernel void @fptosi_v2f16_to_v2i32( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -98,7 +98,7 @@ entry: ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptosi_v2f16_to_v2i64( +define amdgpu_kernel void @fptosi_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index a6876624a0c..10f59c8e782 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -8,7 +8,7 @@ ; VI: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_I16]] 
; GCN: s_endpgm -define void @fptoui_f16_to_i16( +define amdgpu_kernel void @fptoui_f16_to_i16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ entry: ; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @fptoui_f16_to_i32( +define amdgpu_kernel void @fptoui_f16_to_i32( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -41,7 +41,7 @@ entry: ; GCN: buffer_load_ushort ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptoui_f16_to_i64( +define amdgpu_kernel void @fptoui_f16_to_i64( i64 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -64,7 +64,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]] ; GCN: buffer_store_dword v[[R_V2_I16]] ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i16( +define amdgpu_kernel void @fptoui_v2f16_to_v2i16( <2 x i16> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -82,7 +82,7 @@ entry: ; GCN: v_cvt_u32_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i32( +define amdgpu_kernel void @fptoui_v2f16_to_v2i32( <2 x i32> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: @@ -100,7 +100,7 @@ entry: ; GCN: v_cvt_f32_f16_e32 ; GCN: v_cvt_f32_f16_e32 ; GCN: s_endpgm -define void @fptoui_v2f16_to_v2i64( +define amdgpu_kernel void @fptoui_v2f16_to_v2i64( <2 x i64> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index 0719486af8e..ef6d778bd97 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -8,7 +8,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fptrunc_f32_to_f16( +define amdgpu_kernel void @fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ entry: ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fptrunc_f64_to_f16( +define amdgpu_kernel void @fptrunc_f64_to_f16( half addrspace(1)* %r, double addrspace(1)* %a) { entry: @@ -48,7 +48,7 @@ entry: ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fptrunc_v2f32_to_v2f16( +define amdgpu_kernel void @fptrunc_v2f32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x float> addrspace(1)* %a) { entry: @@ -74,7 +74,7 @@ entry: ; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] ; GCN: buffer_store_dword v[[R_V2_F16]] -define void @fptrunc_v2f64_to_v2f16( +define amdgpu_kernel void @fptrunc_v2f64_to_v2f16( <2 x half> addrspace(1)* %r, <2 x double> addrspace(1)* %a) { entry: @@ -89,7 +89,7 @@ entry: ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fneg_fptrunc_f32_to_f16( +define amdgpu_kernel void @fneg_fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -105,7 +105,7 @@ entry: ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]| ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fabs_fptrunc_f32_to_f16( +define amdgpu_kernel void @fabs_fptrunc_f32_to_f16( half addrspace(1)* %r, float addrspace(1)* %a) { entry: @@ -121,7 +121,7 @@ entry: ; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]| ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fneg_fabs_fptrunc_f32_to_f16( +define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16( 
half addrspace(1)* %r, float addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.ll index 0c7b67406a8..d9c5b7e6f35 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32: ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { %result = fptrunc double %in to float store float %result, float addrspace(1)* %out ret void @@ -14,7 +14,7 @@ define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) { ; GCN-NOT: v_cvt ; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]] ; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]] -define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { +define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { %result = fptrunc double %in to half %result_i16 = bitcast half %result to i16 store i16 %result_i16, i16 addrspace(1)* %out @@ -24,7 +24,7 @@ define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) { ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32: ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) { %result = fptrunc <2 x double> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -35,7 +35,7 @@ define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) { %result = fptrunc <4 x double> %in to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out ret void @@ -50,7 +50,7 @@ define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 ; GCN: v_cvt_f32_f64_e32 -define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { +define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) { %result = fptrunc <8 x double> %in to <8 x float> store <8 x float> %result, <8 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fract.f64.ll b/llvm/test/CodeGen/AMDGPU/fract.f64.ll index 0651dce8d95..7a5bcfffa3f 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.f64.ll @@ -27,7 +27,7 @@ declare double @llvm.floor.f64(double) #0 ; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]] ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %floor.x = call double @llvm.floor.f64(double %x) %fract = fsub double %x, %floor.x @@ -54,7 +54,7 @@ define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]] ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) 
#1 { +define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %neg.x = fsub double -0.0, %x %floor.neg.x = call double @llvm.floor.f64(double %neg.x) @@ -82,7 +82,7 @@ define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]| ; GCN: buffer_store_dwordx2 [[FRACT]] -define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %abs.x = call double @llvm.fabs.f64(double %x) %neg.abs.x = fsub double -0.0, %abs.x @@ -98,7 +98,7 @@ define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* % ; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]] ; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]] ; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]] -define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { +define amdgpu_kernel void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 { %x = load double, double addrspace(1)* %src %floor.x = call double @llvm.floor.f64(double %x) %fract = fsub double %x, %floor.x diff --git a/llvm/test/CodeGen/AMDGPU/fract.ll b/llvm/test/CodeGen/AMDGPU/fract.ll index 4e1a503b129..207fe280c9a 100644 --- a/llvm/test/CodeGen/AMDGPU/fract.ll +++ b/llvm/test/CodeGen/AMDGPU/fract.ll @@ -14,7 +14,7 @@ declare float @llvm.floor.f32(float) #0 ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]] ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %floor.x = call float @llvm.floor.f32(float %x) %fract = fsub float %x, %floor.x @@ -29,7 +29,7 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]] ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %x.neg = fsub float -0.0, %x %floor.x.neg = call float @llvm.floor.f32(float %x.neg) @@ -45,7 +45,7 @@ define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) # ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]| ; GCN: buffer_store_dword [[RESULT]] -define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %abs.x = call float @llvm.fabs.f32(float %x) %neg.abs.x = fsub float -0.0, %abs.x @@ -61,7 +61,7 @@ define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %sr ; GCN-UNSAFE: buffer_store_dword [[FLOOR]] ; GCN-UNSAFE: buffer_store_dword [[FRACT]] -define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { +define amdgpu_kernel void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 { %x = load float, float addrspace(1)* %src %floor.x = call float @llvm.floor.f32(float %x) 
%fract = fsub float %x, %floor.x diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index 58b8308bc78..9778069d047 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -15,7 +15,7 @@ ; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #0 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -33,7 +33,7 @@ define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]] ; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]] ; GCN: buffer_store_dword [[RESULT]] -define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, +define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, float addrspace(1)* %in2) #1 { %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4 %r0 = load float, float addrspace(1)* %in1, align 4 @@ -54,7 +54,7 @@ define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1, ; GCN: v_add_f64 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #0 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -70,7 +70,7 @@ define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; CI: v_trunc_f64_e32 ; GCN: v_fma_f64 ; GCN: s_endpgm -define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) #1 { %r0 = load double, double addrspace(1)* %in1, align 8 %r1 = load double, double addrspace(1)* %in2, align 8 @@ -79,7 +79,7 @@ define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in ret void } -define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1, <2 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8 @@ -89,7 +89,7 @@ define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1) ret void } -define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1, <4 x float> addrspace(1)* %in2) #0 { %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16 @@ -99,7 +99,7 @@ define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1) ret void } -define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, +define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1, <2 x double> addrspace(1)* %in2) #0 { %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* 
%in2, i32 4 %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index ed040436a61..453d8fb37f2 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 { %r0 = load double, double addrspace(1)* %in %r1 = call double @llvm.sqrt.f64(double %r0) store double %r1, double addrspace(1)* %out @@ -12,7 +12,7 @@ define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %i ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64: ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}} -define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 { +define amdgpu_kernel void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 { %r0 = load double, double addrspace(1)* %in %r1 = call double @llvm.sqrt.f64(double %r0) store double %r1, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll index b6526b8e078..a0fd3411ca0 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}v_safe_fsqrt_f32: ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} -define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %r0 = load float, float addrspace(1)* %in %r1 = call float @llvm.sqrt.f32(float %r0) store float %r1, float addrspace(1)* %out @@ -16,7 +16,7 @@ define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32: ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}} -define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 { +define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 { %r0 = load float, float addrspace(1)* %in %r1 = call float @llvm.sqrt.f32(float %r0) store float %r1, float addrspace(1)* %out @@ -29,7 +29,7 @@ define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %i ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z ; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS -define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { entry: %fdiv = call float @llvm.sqrt.f32(float %in) store float %fdiv, float addrspace(1)* %out @@ -44,7 +44,7 @@ entry: ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS -define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) store <2 x float> %fdiv, <2 x float> addrspace(1)* %out @@ -65,7 +65,7 @@ entry: ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS -define void 
@s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { +define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) store <4 x float> %fdiv, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ entry: ; FUNC-LABEL: {{^}}elim_redun_check_neg0: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, -0.000000e+00 @@ -87,7 +87,7 @@ entry: ; FUNC-LABEL: {{^}}elim_redun_check_pos0: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp olt float %in, 0.000000e+00 @@ -99,7 +99,7 @@ entry: ; FUNC-LABEL: {{^}}elim_redun_check_ult: ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 { +define amdgpu_kernel void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 { entry: %sqrt = call float @llvm.sqrt.f32(float %in) %cmp = fcmp ult float %in, -0.000000e+00 @@ -112,7 +112,7 @@ entry: ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00> @@ -125,7 +125,7 @@ entry: ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 ; GCN-NOT: v_cndmask -define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { +define amdgpu_kernel void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00> diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index 178bf80d830..972a8bb9665 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -12,7 +12,7 @@ ; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16( +define amdgpu_kernel void @fsub_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -32,7 +32,7 @@ entry: ; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16_imm_a( +define amdgpu_kernel void @fsub_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -50,7 +50,7 @@ entry: ; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fsub_f16_imm_b( +define amdgpu_kernel void @fsub_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -89,7 +89,7 @@ entry: ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16( +define amdgpu_kernel void @fsub_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -123,7 +123,7 @@ entry: ; GCN: 
buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16_imm_a( +define amdgpu_kernel void @fsub_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -154,7 +154,7 @@ entry: ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fsub_v2f16_imm_b( +define amdgpu_kernel void @fsub_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fsub.ll b/llvm/test/CodeGen/AMDGPU/fsub.ll index de3ab69448b..e7a92d95d48 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.ll @@ -4,7 +4,7 @@ ; FUNC-LABEL: {{^}}v_fsub_f32: ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -17,7 +17,7 @@ define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { %sub = fsub float %a, %b store float %sub, float addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) { ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { +define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) { %sub = fsub <2 x float> %a, %b store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8 ret void @@ -45,7 +45,7 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1 %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16 %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16 @@ -60,7 +60,7 @@ define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace( ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; SI: s_endpgm -define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { +define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) { %result = fsub <4 x float> %a, %b store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16 ret void @@ -69,7 +69,7 @@ define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x f ; FUNC-LABEL: {{^}}v_fneg_fsub_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] -define void @v_fneg_fsub_f32(float 
addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -82,7 +82,7 @@ define void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor -define void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -95,7 +95,7 @@ define void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* % ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI-NOT: xor -define void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -111,7 +111,7 @@ define void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrs ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32: ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]] -define void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 { %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %a = load float, float addrspace(1)* %in, align 4 %b = load float, float addrspace(1)* %b_ptr, align 4 @@ -123,7 +123,7 @@ define void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float ; FUNC-LABEL: {{^}}v_fsub_0_nsz_attribute_f32: ; SI-NOT: v_sub -define void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %a = load float, float addrspace(1)* %in, align 4 %result = fsub float %a, 0.0 store float %result, float addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fsub64.ll b/llvm/test/CodeGen/AMDGPU/fsub64.ll index 4c9c5ddd4c6..1b0879d098e 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub64.ll @@ -5,7 +5,7 @@ declare double @llvm.fabs.f64(double) #0 ; SI-LABEL: {{^}}fsub_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -16,7 +16,7 @@ define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: {{^}}fsub_fabs_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}} -define void @fsub_fabs_f64(double 
addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -28,7 +28,7 @@ define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1, ; SI-LABEL: {{^}}fsub_fabs_inv_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}} -define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, +define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1, double addrspace(1)* %in2) { %r0 = load double, double addrspace(1)* %in1 %r1 = load double, double addrspace(1)* %in2 @@ -40,7 +40,7 @@ define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* % ; SI-LABEL: {{^}}s_fsub_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, %b store double %sub, double addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) { ; SI-LABEL: {{^}}s_fsub_imm_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0 -define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double 4.0, %a store double %sub, double addrspace(1)* %out ret void @@ -56,7 +56,7 @@ define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) { ; SI-LABEL: {{^}}s_fsub_imm_inv_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0 -define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { +define amdgpu_kernel void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) { %sub = fsub double %a, 4.0 store double %sub, double addrspace(1)* %out ret void @@ -64,7 +64,7 @@ define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) ; SI-LABEL: {{^}}s_fsub_self_f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}} -define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { +define amdgpu_kernel void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { %sub = fsub double %a, %a store double %sub, double addrspace(1)* %out ret void @@ -73,7 +73,7 @@ define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) { ; SI-LABEL: {{^}}fsub_v2f64: ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { +define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) { %sub = fsub <2 x double> %a, %b store <2 x double> %sub, <2 x double> addrspace(1)* %out ret void @@ -84,7 +84,7 @@ define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x d ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @fsub_v4f64(<4 x double> 
addrspace(1)* %out, <4 x double> addrspace(1)* %in) { +define amdgpu_kernel void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) { %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1 %a = load <4 x double>, <4 x double> addrspace(1)* %in %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr @@ -98,7 +98,7 @@ define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace( ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}} -define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { +define amdgpu_kernel void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) { %result = fsub <4 x double> %a, %b store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll index c4138ad79c2..1f72ec65588 100644 --- a/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -13,7 +13,7 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone ; CI: v_trunc_f64 ; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11 ; SI: s_endpgm -define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { %x = load double, double addrspace(1)* %in, align 8 %y = call double @llvm.trunc.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out, align 8 @@ -36,7 +36,7 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) { ; SI-DAG: cndmask_b32 ; SI-DAG: cndmask_b32 ; SI: s_endpgm -define void @ftrunc_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) { %y = call double @llvm.trunc.f64(double %x) nounwind readnone store double %y, double addrspace(1)* %out ret void @@ -45,7 +45,7 @@ define void @ftrunc_f64(double addrspace(1)* %out, double %x) { ; FUNC-LABEL: {{^}}ftrunc_v2f64: ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { +define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, <2 x double> addrspace(1)* %out ret void @@ -55,7 +55,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; FIXME-CI: v_trunc_f64_e32 ; FIXME-CI: v_trunc_f64_e32 ; FIXME-CI: v_trunc_f64_e32 -; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { +; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) { ; %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone ; store <3 x double> %y, <3 x double> addrspace(1)* %out ; ret void @@ -66,7 +66,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) { ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { +define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone store <4 
x double> %y, <4 x double> addrspace(1)* %out ret void @@ -81,7 +81,7 @@ define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) { ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { +define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, <8 x double> addrspace(1)* %out ret void @@ -104,7 +104,7 @@ define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) { ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 ; CI: v_trunc_f64_e32 -define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { +define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) { %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, <16 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/ftrunc.ll b/llvm/test/CodeGen/AMDGPU/ftrunc.ll index d0718394e7f..b5ad01eaeaf 100644 --- a/llvm/test/CodeGen/AMDGPU/ftrunc.ll +++ b/llvm/test/CodeGen/AMDGPU/ftrunc.ll @@ -12,7 +12,7 @@ declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone ; FUNC-LABEL: {{^}}ftrunc_f32: ; EG: TRUNC ; SI: v_trunc_f32_e32 -define void @ftrunc_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) { %y = call float @llvm.trunc.f32(float %x) nounwind readnone store float %y, float addrspace(1)* %out ret void @@ -23,7 +23,7 @@ define void @ftrunc_f32(float addrspace(1)* %out, float %x) { ; EG: TRUNC ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { +define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone store <2 x float> %y, <2 x float> addrspace(1)* %out ret void @@ -36,7 +36,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { ; FIXME-SI: v_trunc_f32_e32 ; FIXME-SI: v_trunc_f32_e32 ; FIXME-SI: v_trunc_f32_e32 -; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { +; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) { ; %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone ; store <3 x float> %y, <3 x float> addrspace(1)* %out ; ret void @@ -51,7 +51,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) { ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { +define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone store <4 x float> %y, <4 x float> addrspace(1)* %out ret void @@ -74,7 +74,7 @@ define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) { ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { +define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) { %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone store <8 x float> %y, <8 x float> addrspace(1)* %out ret void @@ -113,7 +113,7 @@ define void @ftrunc_v8f32(<8 x float> 
addrspace(1)* %out, <8 x float> %x) { ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 ; SI: v_trunc_f32_e32 -define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { +define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) { %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone store <16 x float> %y, <16 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll index f96463613e8..7fb47e08ea5 100644 --- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll +++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s -define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { +define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ; CHECK-LABEL: {{^}}use_gep_address_space: ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}} ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64 @@ -17,7 +17,7 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind { ; SI: s_or_b32 ; CI: s_add_i32 ; CHECK: ds_write_b32 -define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { +define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind { %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384 store i32 99, i32 addrspace(3)* %p ret void @@ -39,7 +39,7 @@ define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %arra ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_endpgm -define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { +define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16> %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1 @@ -60,7 +60,7 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_endpgm -define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { +define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind { %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16> %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0 %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll index 5a18d425d50..80acfcca708 100644 --- a/llvm/test/CodeGen/AMDGPU/global-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll @@ -26,7 +26,7 @@ ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4 -define void @private_test(i32 %index, float addrspace(1)* %out) { +define amdgpu_kernel void @private_test(i32 %index, 
float addrspace(1)* %out) { %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index %val = load float, float addrspace(2)* %ptr store float %val, float addrspace(1)* %out @@ -40,7 +40,7 @@ define void @private_test(i32 %index, float addrspace(1)* %out) { ; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}} ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4 -define void @available_externally_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(2)* %ptr store i32 %val, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/global-directive.ll b/llvm/test/CodeGen/AMDGPU/global-directive.ll index 450b7d36742..ce89e390eac 100644 --- a/llvm/test/CodeGen/AMDGPU/global-directive.ll +++ b/llvm/test/CodeGen/AMDGPU/global-directive.ll @@ -5,7 +5,7 @@ ; SI: .globl foo ; SI: {{^}}foo: -define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr diff --git a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll index 2c7c02de167..19e592f50be 100644 --- a/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/global-extload-i16.ll @@ -7,7 +7,7 @@ ; SI: buffer_load_ushort ; SI: buffer_store_dword ; SI: s_endpgm -define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; SI: buffer_load_sshort ; SI: buffer_store_dword ; SI: s_endpgm -define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -28,7 +28,7 @@ define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32: ; SI: buffer_load_ushort ; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -38,7 +38,7 @@ define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32: ; SI: buffer_load_sshort ; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> 
addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -47,7 +47,7 @@ define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32: ; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -56,7 +56,7 @@ define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32: ; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -65,7 +65,7 @@ define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32: ; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -74,7 +74,7 @@ define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32: ; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -83,7 +83,7 @@ define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32: ; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -92,7 +92,7 @@ define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32: ; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> 
%load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -101,7 +101,7 @@ define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32: ; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -110,7 +110,7 @@ define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32: ; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -119,7 +119,7 @@ define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32: ; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -128,7 +128,7 @@ define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32: ; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32: ; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -146,7 +146,7 @@ define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32: ; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to 
<64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -157,7 +157,7 @@ define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]], ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]] -define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -168,7 +168,7 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]] ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 -define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -177,7 +177,7 @@ define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64: ; SI: s_endpgm -define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -186,7 +186,7 @@ define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64: ; SI: s_endpgm -define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -195,7 +195,7 @@ define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64: ; SI: s_endpgm -define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -204,7 +204,7 @@ define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64: ; SI: s_endpgm -define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -213,7 +213,7 @@ define 
void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64: ; SI: s_endpgm -define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -222,7 +222,7 @@ define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64: ; SI: s_endpgm -define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -231,7 +231,7 @@ define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64: ; SI: s_endpgm -define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -240,7 +240,7 @@ define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64: ; SI: s_endpgm -define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64: ; SI: s_endpgm -define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -258,7 +258,7 @@ define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64: ; SI: s_endpgm -define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -267,7 +267,7 @@ define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, 
<16 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64: ; SI: s_endpgm -define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -276,7 +276,7 @@ define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64: ; SI: s_endpgm -define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -285,7 +285,7 @@ define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64: ; SI: s_endpgm -define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -294,7 +294,7 @@ define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64: ; SI: s_endpgm -define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { +define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll index 00be6e4d5c1..ae6dd54fec6 100644 --- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -19,7 +19,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @private_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -33,7 +33,7 @@ define void @private_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @internal_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 
addrspace(1)* %out @@ -50,7 +50,7 @@ define void @internal_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @available_externally_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -67,7 +67,7 @@ define void @available_externally_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @linkonce_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -84,7 +84,7 @@ define void @linkonce_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @weak_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -101,7 +101,7 @@ define void @weak_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @common_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -118,7 +118,7 @@ define void @common_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @extern_weak_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -135,7 +135,7 @@ define void @extern_weak_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @linkonce_odr_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -152,7 +152,7 @@ define void @linkonce_odr_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, 
v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @weak_odr_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -169,7 +169,7 @@ define void @weak_odr_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @external_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out @@ -186,7 +186,7 @@ define void @external_test(i32 addrspace(1)* %out) { ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @external_w_init_test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr store i32 %val, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics.ll b/llvm/test/CodeGen/AMDGPU/global_atomics.ll index 909ceb5546c..6928bede547 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_offset: ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -13,7 +13,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_soffset: ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -25,7 +25,7 @@ entry: ; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd ; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add -define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -36,7 +36,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset: ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* 
%gep, i32 %in seq_cst @@ -47,7 +47,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -59,7 +59,7 @@ entry: ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -70,7 +70,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32: ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -79,7 +79,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_ret: ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -89,7 +89,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64: ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -100,7 +100,7 @@ entry: ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -110,7 +110,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_offset: ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, 
i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -120,7 +120,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset: ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -131,7 +131,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -143,7 +143,7 @@ entry: ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -154,7 +154,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32: ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -163,7 +163,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_ret: ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -173,7 +173,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64: ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -184,7 +184,7 @@ entry: ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword 
[[RET]] -define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -194,7 +194,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_offset: ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -204,7 +204,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset: ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -215,7 +215,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -227,7 +227,7 @@ entry: ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -238,7 +238,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32: ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -247,7 +247,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_ret: ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -257,7 +257,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: ; SI: buffer_atomic_sub v{{[0-9]+}}, 
v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -268,7 +268,7 @@ entry: ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -278,7 +278,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_offset: ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -288,7 +288,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset: ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -299,7 +299,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -311,7 +311,7 @@ entry: ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -322,7 +322,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32: ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { 
entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -331,7 +331,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_ret: ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -341,7 +341,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64: ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -352,7 +352,7 @@ entry: ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -362,7 +362,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_offset: ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -372,7 +372,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset: ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -383,7 +383,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -395,7 +395,7 @@ entry: ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], 
v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -406,7 +406,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32: ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -415,7 +415,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_ret: ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -425,7 +425,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -436,7 +436,7 @@ entry: ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -446,7 +446,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_offset: ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -456,7 +456,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset: ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -467,7 +467,7 @@ entry: ; FUNC-LABEL: 
{{^}}atomic_min_i32_addr64_offset: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -479,7 +479,7 @@ entry: ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -490,7 +490,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32: ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -499,7 +499,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_ret: ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -509,7 +509,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64: ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -520,7 +520,7 @@ entry: ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -530,7 +530,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_offset: ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr 
i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -540,7 +540,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset: ; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -551,7 +551,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -563,7 +563,7 @@ entry: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -574,7 +574,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32: ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -583,7 +583,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_ret: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -593,7 +593,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -604,7 +604,7 @@ entry: ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] 
-define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -614,7 +614,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_offset: ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -624,7 +624,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset: ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -635,7 +635,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -647,7 +647,7 @@ entry: ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -658,7 +658,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32: ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -667,7 +667,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_ret: ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -677,7 +677,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64: ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -688,7 +688,7 @@ entry: ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -698,7 +698,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -708,7 +708,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset: ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -720,7 +720,7 @@ entry: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}} -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -733,7 +733,7 @@ entry: ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -744,7 +744,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32: ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -753,7 +753,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret: ; GCN: buffer_atomic_swap 
[[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -763,7 +763,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -774,7 +774,7 @@ entry: ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -784,7 +784,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -794,7 +794,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset: ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -807,7 +807,7 @@ entry: ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -819,7 +819,7 @@ entry: ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] 
glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -831,7 +831,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32: ; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -840,7 +840,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret: ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst %extract0 = extractvalue { i32, i1 } %val, 0 @@ -851,7 +851,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -862,7 +862,7 @@ entry: ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword v[[RET]] -define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -873,7 +873,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_offset: ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}} -define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -883,7 +883,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset: ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void 
@atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -894,7 +894,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -906,7 +906,7 @@ entry: ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -917,7 +917,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32: ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -926,7 +926,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_ret: ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst store i32 %val, i32 addrspace(1)* %out2 @@ -936,7 +936,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}} -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -947,7 +947,7 @@ entry: ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -959,7 +959,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], off, 
s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4 %val = load atomic i32, i32 addrspace(1)* %gep seq_cst, align 4 @@ -971,7 +971,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4 store i32 %val, i32 addrspace(1)* %out @@ -982,7 +982,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -995,7 +995,7 @@ entry: ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dword [[RET]] -define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4 @@ -1006,7 +1006,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_offset: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 @@ -1016,7 +1016,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32: ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 ret void @@ -1025,7 +1025,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}} ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}} -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 
%index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
%gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -1036,7 +1036,7 @@ entry:
; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f66c6c7b531..56520b787ea 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}atomic_add_i64_offset:
; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
entry:
%gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
%tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -24,7 +24,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -36,7 +36,7 @@ entry:
; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
entry:
%ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
%gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -47,7 +47,7 @@ entry:
; GCN-LABEL: {{^}}atomic_add_i64:
; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
entry:
%tmp0 = atomicrmw
volatile add i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -56,7 +56,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_ret: ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -66,7 +66,7 @@ entry: ; GCN-LABEL: {{^}}atomic_add_i64_addr64: ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -77,7 +77,7 @@ entry: ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -87,7 +87,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_offset: ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst @@ -97,7 +97,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset: ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst @@ -108,7 +108,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -120,7 +120,7 @@ entry: ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], 
s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -131,7 +131,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64: ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -140,7 +140,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_ret: ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -150,7 +150,7 @@ entry: ; GCN-LABEL: {{^}}atomic_and_i64_addr64: ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -161,7 +161,7 @@ entry: ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -171,7 +171,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_offset: ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst @@ -181,7 +181,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset: ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 
addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst @@ -192,7 +192,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -204,7 +204,7 @@ entry: ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -215,7 +215,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64: ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -224,7 +224,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_ret: ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -234,7 +234,7 @@ entry: ; GCN-LABEL: {{^}}atomic_sub_i64_addr64: ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -245,7 +245,7 @@ entry: ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw 
volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -255,7 +255,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_offset: ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst @@ -265,7 +265,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset: ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst @@ -276,7 +276,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -288,7 +288,7 @@ entry: ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -299,7 +299,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64: ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -308,7 +308,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_ret: ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -318,7 +318,7 @@ entry: ; GCN-LABEL: {{^}}atomic_max_i64_addr64: ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_max_i64_addr64(i64 
addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -329,7 +329,7 @@ entry: ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -339,7 +339,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_offset: ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst @@ -349,7 +349,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset: ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst @@ -360,7 +360,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -372,7 +372,7 @@ entry: ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -383,7 +383,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64: ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, 
i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -392,7 +392,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_ret: ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -402,7 +402,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umax_i64_addr64: ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -413,7 +413,7 @@ entry: ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -423,7 +423,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_offset: ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst @@ -433,7 +433,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset: ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst @@ -444,7 +444,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -456,7 +456,7 @@ entry: ; CI: buffer_atomic_smin_x2 
[[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -467,7 +467,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64: ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -476,7 +476,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_ret: ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -486,7 +486,7 @@ entry: ; GCN-LABEL: {{^}}atomic_min_i64_addr64: ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -497,7 +497,7 @@ entry: ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -507,7 +507,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_offset: ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst @@ -517,7 +517,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset: ; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) 
{ +define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst @@ -528,7 +528,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -540,7 +540,7 @@ entry: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -551,7 +551,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64: ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -560,7 +560,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_ret: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -570,7 +570,7 @@ entry: ; GCN-LABEL: {{^}}atomic_umin_i64_addr64: ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -581,7 +581,7 @@ entry: ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { 
entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -591,7 +591,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_offset: ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst @@ -601,7 +601,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset: ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst @@ -612,7 +612,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -624,7 +624,7 @@ entry: ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -635,7 +635,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64: ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -644,7 +644,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_ret: ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -654,7 +654,7 @@ entry: ; GCN-LABEL: {{^}}atomic_or_i64_addr64: ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} 
-define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -665,7 +665,7 @@ entry: ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -675,7 +675,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_offset: ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst @@ -685,7 +685,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset: ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst @@ -696,7 +696,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -708,7 +708,7 @@ entry: ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -719,7 +719,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64: ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void 
@atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -728,7 +728,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_ret: ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -738,7 +738,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64: ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -749,7 +749,7 @@ entry: ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -759,7 +759,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_offset: ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst @@ -769,7 +769,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset: ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst @@ -780,7 +780,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -792,7 +792,7 @@ 
entry: ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -803,7 +803,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64: ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst ret void @@ -812,7 +812,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_ret: ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { +define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) { entry: %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst store i64 %tmp0, i64 addrspace(1)* %out2 @@ -822,7 +822,7 @@ entry: ; GCN-LABEL: {{^}}atomic_xor_i64_addr64: ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} -define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -833,7 +833,7 @@ entry: ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst @@ -851,7 +851,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_offset: ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} -define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -861,7 +861,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_soffset: ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}} -define void 
@atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -871,7 +871,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset: ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst @@ -884,7 +884,7 @@ entry: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}} -define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -896,7 +896,7 @@ entry: ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -908,7 +908,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64: ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} -define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst ret void @@ -917,7 +917,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret: ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) { entry: %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst %extract0 = extractvalue { i64, i1 } %val, 0 @@ -928,7 +928,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_addr64: ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}} ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}} -define 
void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -939,7 +939,7 @@ entry: ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]: -define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst @@ -952,7 +952,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4 %val = load atomic i64, i64 addrspace(1)* %gep seq_cst, align 8 @@ -964,7 +964,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) { entry: %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8 store i64 %val, i64 addrspace(1)* %out @@ -975,7 +975,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -988,7 +988,7 @@ entry: ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}} ; GCN: buffer_store_dwordx2 [[RET]] -define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8 @@ -999,7 +999,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64_offset: ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, 
s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}} -define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) { entry: %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4 store atomic i64 %in, i64 addrspace(1)* %gep seq_cst, align 8 @@ -1009,7 +1009,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc -define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) { entry: store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8 ret void @@ -1018,7 +1018,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4 @@ -1029,7 +1029,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i64_addr64: ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}} ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}} -define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll index d07843e9dd2..0903542bac4 100644 --- a/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/gv-const-addrspace.ll @@ -15,7 +15,7 @@ ; EG: @float_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @float(float addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 @@ -33,7 +33,7 @@ entry: ; EG: @i32_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @i32(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) { entry: %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index %1 = load i32, i32 addrspace(2)* %0 @@ -53,7 +53,7 @@ entry: ; EG: @struct_foo_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index %load = load i32, i32 addrspace(2)* %gep, align 4 store i32 %load, i32 addrspace(1)* %out, align 4 @@ -72,7 +72,7 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) { ; EG: 
@array_v1_gv ; EG-NOT: MOVA_INT ; EG-NOT: MOV -define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4 @@ -84,7 +84,7 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) { ; EG: VTX_READ_32 ; EG: @float_gv ; EG-NOT: MOVA_INT -define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { +define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) { entry: %0 = icmp eq i32 0, %a br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll b/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll index 2b5af75936f..e641d7266a7 100644 --- a/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/gv-offset-folding.ll @@ -13,7 +13,7 @@ ; CHECK-LABEL: lds_no_offset: ; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4 -define void @lds_no_offset() { +define amdgpu_kernel void @lds_no_offset() { entry: %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1 store i32 0, i32 addrspace(3)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll index 4b4895028c9..82a6c1044d5 100644 --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -8,7 +8,7 @@ ; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]] ; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]] ; GCN: buffer_store_short [[CVT]] -define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { store half %arg, half addrspace(1)* %out ret void } @@ -20,7 +20,7 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 { ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]] ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN: s_endpgm -define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { store <2 x half> %arg, <2 x half> addrspace(1)* %out ret void } @@ -34,7 +34,7 @@ define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 { ; GCN-DAG: buffer_store_short ; GCN-NOT: buffer_store ; GCN: s_endpgm -define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { store <3 x half> %arg, <3 x half> addrspace(1)* %out ret void } @@ -46,33 +46,33 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 { ; GCN: buffer_load_ushort ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 { store <4 x half> %arg, <4 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}load_v8f16_arg: -define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 { store <8 x half> %arg, <8 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v2f16_arg: -define void 
@extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 { %fpext = fpext <2 x half> %in to <2 x float> store <2 x float> %fpext, <2 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_f16_to_f32_arg: -define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to float store float %ext, float addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg: -define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x float> store <2 x float> %ext, <2 x float> addrspace(1)* %out ret void @@ -90,14 +90,14 @@ define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x hal ; GCN-DAG: buffer_store_dword ; GCN-DAG: buffer_store_dwordx2 ; GCN: s_endpgm -define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { +define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x float> store <3 x float> %ext, <3 x float> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg: -define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x float> store <4 x float> %ext, <4 x float> addrspace(1)* %out ret void @@ -124,7 +124,7 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x float> store <8 x float> %ext, <8 x float> addrspace(1)* %out ret void @@ -138,7 +138,7 @@ define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x hal ; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]] ; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]] ; GCN: buffer_store_dwordx2 [[RESULT]] -define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { +define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { %ext = fpext half %arg to double store double %ext, double addrspace(1)* %out ret void @@ -152,7 +152,7 @@ define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 { ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { +define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 { %ext = fpext <2 x half> %arg to <2 x double> store <2 x double> %ext, <2 x double> addrspace(1)* %out ret void @@ -169,7 +169,7 @@ define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x ha ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { 
+define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 { %ext = fpext <3 x half> %arg to <3 x double> store <3 x double> %ext, <3 x double> addrspace(1)* %out ret void @@ -189,7 +189,7 @@ define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x ha ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { +define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 { %ext = fpext <4 x half> %arg to <4 x double> store <4 x double> %ext, <4 x double> addrspace(1)* %out ret void @@ -227,7 +227,7 @@ define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x ha ; GCN-DAG: v_cvt_f64_f32_e32 ; GCN: s_endpgm -define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { +define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 { %ext = fpext <8 x half> %arg to <8 x double> store <8 x double> %ext, <8 x double> addrspace(1)* %out ret void @@ -236,7 +236,7 @@ define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x ha ; GCN-LABEL: {{^}}global_load_store_f16: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in store half %val, half addrspace(1)* %out ret void @@ -245,7 +245,7 @@ define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* % ; GCN-LABEL: {{^}}global_load_store_v2f16: ; GCN: buffer_load_dword [[TMP:v[0-9]+]] ; GCN: buffer_store_dword [[TMP]] -define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in store <2 x half> %val, <2 x half> addrspace(1)* %out ret void @@ -254,7 +254,7 @@ define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a ; GCN-LABEL: {{^}}global_load_store_v4f16: ; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx2 [[TMP]] -define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in store <4 x half> %val, <4 x half> addrspace(1)* %out ret void @@ -264,7 +264,7 @@ define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> ad ; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]] ; GCN: s_endpgm -define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in store <8 x half> %val, <8 x half> addrspace(1)* %out ret void @@ -274,7 +274,7 @@ define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> a ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]] ; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], 
[[LOAD]] ; GCN: buffer_store_dword [[CVT]] -define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %cvt = fpext half %val to float store float %cvt, float addrspace(1)* %out @@ -289,7 +289,7 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace( ; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}} ; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x float> store <2 x float> %cvt, <2 x float> addrspace(1)* %out @@ -297,7 +297,7 @@ define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x } ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32: -define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x float> store <3 x float> %cvt, <3 x float> addrspace(1)* %out @@ -305,7 +305,7 @@ define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32: -define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in %cvt = fpext <4 x half> %val to <4 x float> store <4 x float> %cvt, <4 x float> addrspace(1)* %out @@ -313,7 +313,7 @@ define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32: -define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in %cvt = fpext <8 x half> %val to <8 x float> store <8 x float> %cvt, <8 x float> addrspace(1)* %out @@ -347,7 +347,7 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x float> store <16 x float> %cvt, <16 x float> addrspace(1)* %out @@ -359,7 +359,7 @@ define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <1 ; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]] ; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]] ; GCN: buffer_store_dwordx2 [[CVT1]] -define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_f16_to_f64(double 
addrspace(1)* %out, half addrspace(1)* %in) #0 { %val = load half, half addrspace(1)* %in %cvt = fpext half %val to double store double %cvt, double addrspace(1)* %out @@ -375,7 +375,7 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]] ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}} ; GCN: s_endpgm -define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %val = load <2 x half>, <2 x half> addrspace(1)* %in %cvt = fpext <2 x half> %val to <2 x double> store <2 x double> %cvt, <2 x double> addrspace(1)* %out @@ -413,7 +413,7 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} ; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN: s_endpgm -define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 { %val = load <3 x half>, <3 x half> addrspace(1)* %in %cvt = fpext <3 x half> %val to <3 x double> store <3 x double> %cvt, <3 x double> addrspace(1)* %out @@ -421,7 +421,7 @@ define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x } ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64: -define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %val = load <4 x half>, <4 x half> addrspace(1)* %in %cvt = fpext <4 x half> %val to <4 x double> store <4 x double> %cvt, <4 x double> addrspace(1)* %out @@ -429,7 +429,7 @@ define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x } ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64: -define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 { %val = load <8 x half>, <8 x half> addrspace(1)* %in %cvt = fpext <8 x half> %val to <8 x double> store <8 x double> %cvt, <8 x double> addrspace(1)* %out @@ -437,7 +437,7 @@ define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x } ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64: -define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 { %val = load <16 x half>, <16 x half> addrspace(1)* %in %cvt = fpext <16 x half> %val to <16 x double> store <16 x double> %cvt, <16 x double> addrspace(1)* %out @@ -448,7 +448,7 @@ define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, < ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]] ; GCN: buffer_store_short [[CVT]] -define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 { %val = load float, float addrspace(1)* %in %cvt = fptrunc float %val to half store half %cvt, half addrspace(1)* %out @@ -463,7 +463,7 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa ; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]] ; GCN-DAG: buffer_store_dword [[PACKED]] ; GCN: s_endpgm -define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { %val = load <2 x float>, <2 x float> addrspace(1)* %in %cvt = fptrunc <2 x float> %val to <2 x half> store <2 x half> %cvt, <2 x half> addrspace(1)* %out @@ -479,7 +479,7 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 ; GCN: buffer_store_short ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { %val = load <3 x float>, <3 x float> addrspace(1)* %in %cvt = fptrunc <3 x float> %val to <3 x half> store <3 x half> %cvt, <3 x half> addrspace(1)* %out @@ -494,7 +494,7 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 ; GCN: v_cvt_f16_f32_e32 ; GCN: buffer_store_dwordx2 ; GCN: s_endpgm -define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { %val = load <4 x float>, <4 x float> addrspace(1)* %in %cvt = fptrunc <4 x float> %val to <4 x half> store <4 x half> %cvt, <4 x half> addrspace(1)* %out @@ -514,7 +514,7 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 ; GCN: v_cvt_f16_f32_e32 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { %val = load <8 x float>, <8 x float> addrspace(1)* %in %cvt = fptrunc <8 x float> %val to <8 x half> store <8 x half> %cvt, <8 x half> addrspace(1)* %out @@ -545,7 +545,7 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 ; GCN-DAG: buffer_store_dwordx4 ; GCN-DAG: buffer_store_dwordx4 ; GCN: s_endpgm -define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { %val = load <16 x float>, <16 x float> addrspace(1)* %in %cvt = fptrunc <16 x float> %val to <16 x half> store <16 x half> %cvt, <16 x half> addrspace(1)* %out @@ -560,7 +560,7 @@ define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}}, ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { +define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { %add = fadd half %a, %b store half %add, half addrspace(1)* %out, align 4 ret void @@ -570,7 +570,7 @@ define void 
@fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 { ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { +define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 { %add = fadd <2 x half> %a, %b store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8 ret void @@ -582,7 +582,7 @@ define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1 %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16 %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16 @@ -601,7 +601,7 @@ define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* ; SI: v_add_f32 ; SI: v_add_f32 ; GCN: s_endpgm -define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { +define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 { %add = fadd <8 x half> %a, %b store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32 ret void @@ -610,7 +610,7 @@ define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> ; GCN-LABEL: {{^}}test_bitcast_from_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 { %val = load half, half addrspace(1)* %in %val_int = bitcast half %val to i16 store i16 %val_int, i16 addrspace(1)* %out @@ -620,7 +620,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o ; GCN-LABEL: {{^}}test_bitcast_to_half: ; GCN: buffer_load_ushort [[TMP:v[0-9]+]] ; GCN: buffer_store_short [[TMP]] -define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %val = load i16, i16 addrspace(1)* %in %val_fp = bitcast i16 %val to half store half %val_fp, half addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll b/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll index 631d6def444..45efe9b8655 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-default-device.ll @@ -4,7 +4,7 @@ ; unsupported device. 
; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU" -define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { +define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind { store float 0.0, float addrspace(1)* %out0 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll index ecb825a2f20..b1901cf894b 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -4,7 +4,7 @@ ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { +define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -14,7 +14,7 @@ define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { +define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -24,7 +24,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { +define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -34,7 +34,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN: float_mode = 48 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { +define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -44,7 +44,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* ; GCN: float_mode = 240 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { +define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -54,7 +54,7 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace( ; GCN: float_mode = 0 ; GCN: enable_dx10_clamp = 1 ; GCN: enable_ieee_mode = 1 -define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { +define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void @@ -64,7 +64,7 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* % ; GCN: float_mode = 192 ; GCN: enable_dx10_clamp = 0 ; GCN: enable_ieee_mode = 1 -define void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double 
addrspace(1)* %out1) #6 { +define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 { store float 0.0, float addrspace(1)* %out0 store double 0.0, double addrspace(1)* %out1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/hsa-globals.ll b/llvm/test/CodeGen/AMDGPU/hsa-globals.ll index 2820b308edb..2ec57a40f0a 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-globals.ll @@ -9,7 +9,7 @@ @internal_readonly = internal unnamed_addr addrspace(2) constant i32 0 @external_readonly = unnamed_addr addrspace(2) constant i32 0 -define void @test() { +define amdgpu_kernel void @test() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll b/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll index 1999dc38a6b..600793810e5 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-group-segment.ll @@ -3,7 +3,7 @@ @internal_group = internal addrspace(3) global i32 undef @external_group = addrspace(3) global i32 undef -define void @test() { +define amdgpu_kernel void @test() { entry: store i32 0, i32 addrspace(3)* @internal_group store i32 0, i32 addrspace(3)* @external_group diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll index e85db65e742..f6bf0b09486 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll @@ -5,7 +5,7 @@ ; SI-LABEL: {{^}}br_implicit_def: ; SI: BB#0: ; SI-NEXT: s_cbranch_scc1 -define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { +define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 { bb: br i1 undef, label %bb1, label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll index d4912776deb..b160af86a2b 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-phi.ll @@ -10,7 +10,7 @@ ; SI: s_and_saveexec_b64 ; SI: s_xor_b64 ; SI: s_endpgm -define void @br_i1_phi(i32 %arg) { +define amdgpu_kernel void @br_i1_phi(i32 %arg) { bb: %tidig = call i32 @llvm.r600.read.tidig.x() #0 %cmp = trunc i32 %tidig to i1 diff --git a/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll index c218e1918bb..d501be5c8bf 100644 --- a/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll +++ b/llvm/test/CodeGen/AMDGPU/i8-to-double-to-float.ll @@ -2,7 +2,7 @@ ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) { %1 = load i8, i8 addrspace(1)* %in %2 = uitofp i8 %1 to double %3 = fptrunc double %2 to float diff --git a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll index 60e59a5a528..12cc440e48d 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll @@ -6,7 +6,7 @@ ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;CHECK-NOT: SETNE_INT -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll 
b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll index c3dad2d3203..99c2138bbe6 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp.i16.ll @@ -8,7 +8,7 @@ ; GCN-LABEL: {{^}}i16_eq: ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -26,7 +26,7 @@ entry: ; GCN-LABEL: {{^}}i16_ne: ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -44,7 +44,7 @@ entry: ; GCN-LABEL: {{^}}i16_ugt: ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -62,7 +62,7 @@ entry: ; GCN-LABEL: {{^}}i16_uge: ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -80,7 +80,7 @@ entry: ; GCN-LABEL: {{^}}i16_ult: ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -98,7 +98,7 @@ entry: ; GCN-LABEL: {{^}}i16_ule: ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -117,7 +117,7 @@ entry: ; GCN-LABEL: {{^}}i16_sgt: ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -135,7 +135,7 @@ entry: ; GCN-LABEL: {{^}}i16_sge: ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define 
void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -153,7 +153,7 @@ entry: ; GCN-LABEL: {{^}}i16_slt: ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -171,7 +171,7 @@ entry: ; GCN-LABEL: {{^}}i16_sle: ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -190,7 +190,7 @@ entry: ; GCN-LABEL: {{^}}i16_eq_v_s: ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -206,7 +206,7 @@ entry: ; GCN-LABEL: {{^}}i16_ne_v_s: ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -222,7 +222,7 @@ entry: ; GCN-LABEL: {{^}}i16_ugt_v_s: ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -238,7 +238,7 @@ entry: ; GCN-LABEL: {{^}}i16_uge_v_s: ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -254,7 +254,7 @@ entry: ; GCN-LABEL: {{^}}i16_ult_v_s: ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -270,7 +270,7 @@ entry: ; GCN-LABEL: {{^}}i16_ule_v_s: ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_u32_e32 
vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -286,7 +286,7 @@ entry: ; GCN-LABEL: {{^}}i16_sgt_v_s: ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -302,7 +302,7 @@ entry: ; GCN-LABEL: {{^}}i16_sge_v_s: ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -318,7 +318,7 @@ entry: ; GCN-LABEL: {{^}}i16_slt_v_s: ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -334,7 +334,7 @@ entry: ; GCN-LABEL: {{^}}i16_sle_v_s: ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { +define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/icmp64.ll b/llvm/test/CodeGen/AMDGPU/icmp64.ll index 33ad0c9199b..3af74277df1 100644 --- a/llvm/test/CodeGen/AMDGPU/icmp64.ll +++ b/llvm/test/CodeGen/AMDGPU/icmp64.ll @@ -3,7 +3,7 @@ ; SI-LABEL: {{^}}test_i64_eq: ; SI: v_cmp_eq_u64 -define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp eq i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -12,7 +12,7 @@ define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ne: ; SI: v_cmp_ne_u64 -define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ne i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -21,7 +21,7 @@ define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_slt: ; SI: v_cmp_lt_i64 -define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp slt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -30,7 +30,7 @@ define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; 
SI-LABEL: {{^}}test_i64_ult: ; SI: v_cmp_lt_u64 -define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ult i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -39,7 +39,7 @@ define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_sle: ; SI: v_cmp_le_i64 -define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sle i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -48,7 +48,7 @@ define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ule: ; SI: v_cmp_le_u64 -define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ule i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -57,7 +57,7 @@ define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_sgt: ; SI: v_cmp_gt_i64 -define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sgt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -66,7 +66,7 @@ define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_ugt: ; SI: v_cmp_gt_u64 -define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -75,7 +75,7 @@ define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_sge: ; SI: v_cmp_ge_i64 -define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp sge i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -84,7 +84,7 @@ define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; SI-LABEL: {{^}}test_i64_uge: ; SI: v_cmp_ge_u64 -define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %cmp = icmp uge i64 %a, %b %result = sext i1 %cmp to i32 store i32 %result, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/image-attributes.ll b/llvm/test/CodeGen/AMDGPU/image-attributes.ll index 5906b2f1570..53d61e66c6b 100644 --- a/llvm/test/CodeGen/AMDGPU/image-attributes.ll +++ b/llvm/test/CodeGen/AMDGPU/image-attributes.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}width_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].Z -define void @width_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( @@ -20,7 +20,7 @@ entry: ; FUNC-LABEL: {{^}}width_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW 
[[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].Z -define void @width_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -37,7 +37,7 @@ entry: ; FUNC-LABEL: {{^}}height_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].W -define void @height_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d( @@ -50,7 +50,7 @@ entry: ; FUNC-LABEL: {{^}}height_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[2].W -define void @height_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @height_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -67,7 +67,7 @@ entry: ; FUNC-LABEL: {{^}}depth_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].X -define void @depth_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d( @@ -84,7 +84,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Y -define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( @@ -97,7 +97,7 @@ entry: ; FUNC-LABEL: {{^}}data_type_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Y -define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( @@ -114,7 +114,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_2d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Z -define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, +define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d( @@ -127,7 +127,7 @@ entry: ; FUNC-LABEL: {{^}}channel_order_3d: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[3].Z -define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, +define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in, i32 addrspace(1)* %out) { entry: %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d( @@ -146,7 +146,7 @@ entry: ; FUNC-LABEL: {{^}}image_arg_2nd: ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]] ; EG: MOV * [[VAL]], KC0[4].Z -define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, +define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1, i32 %x, %opencl.image2d_t addrspace(1)* %in2, i32 addrspace(1)* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/image-resource-id.ll b/llvm/test/CodeGen/AMDGPU/image-resource-id.ll index d4cf3494424..dac7c7ddaea 100644 --- a/llvm/test/CodeGen/AMDGPU/image-resource-id.ll +++ b/llvm/test/CodeGen/AMDGPU/image-resource-id.ll @@ -7,7 +7,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( 
-define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only +define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( @@ -21,7 +21,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only +define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( @@ -37,7 +37,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only +define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d( @@ -51,7 +51,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only +define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d( @@ -67,7 +67,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -82,7 +82,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -97,7 +97,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -112,7 +112,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only i32 addrspace(1)* %out) { entry: @@ -129,7 +129,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -144,7 +144,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -159,7 +159,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t 
addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -174,7 +174,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only i32 addrspace(1)* %out) { entry: @@ -191,7 +191,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -208,7 +208,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only +define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -226,7 +226,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; write_only %opencl.image2d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -243,7 +243,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; write_only %opencl.image3d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -261,7 +261,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -277,7 +277,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; read_only i32 addrspace(1)* %out) { @@ -293,7 +293,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only %opencl.image3d_t addrspace(1)* %in2, ; read_only %opencl.image2d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* %out) { @@ -309,7 +309,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only +define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only %opencl.image2d_t addrspace(1)* %in2, ; read_only %opencl.image3d_t addrspace(1)* %in3, ; write_only i32 addrspace(1)* 
%out) { diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll index b2c873ab73a..c2668a077b0 100644 --- a/llvm/test/CodeGen/AMDGPU/imm.ll +++ b/llvm/test/CodeGen/AMDGPU/imm.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}i64_imm_inline_lo: ; GCN: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]: -define void @i64_imm_inline_lo(i64 addrspace(1) *%out) { +define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) { entry: store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005 ret void @@ -15,7 +15,7 @@ entry: ; GCN-LABEL: {{^}}i64_imm_inline_hi: ; GCN: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]] -define void @i64_imm_inline_hi(i64 addrspace(1) *%out) { +define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) { entry: store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678 ret void @@ -25,7 +25,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { +define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { store i64 -9223372036854775808, i64 addrspace(1) *%out ret void } @@ -33,7 +33,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) { ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { store i32 -2147483648, i32 addrspace(1)* %out ret void } @@ -41,7 +41,7 @@ define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_0.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { store float 0.0, float addrspace(1)* %out ret void } @@ -49,7 +49,7 @@ define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_imm_neg_0.0_f32: ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { store float -0.0, float addrspace(1)* %out ret void } @@ -57,7 +57,7 @@ define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_0.5_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { store float 0.5, float addrspace(1)* %out ret void } @@ -65,7 +65,7 @@ define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { store float -0.5, float addrspace(1)* %out ret void } @@ -73,7 +73,7 @@ define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_1.0_f32: ; GCN: v_mov_b32_e32 
[[REG:v[0-9]+]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { store float 1.0, float addrspace(1)* %out ret void } @@ -81,7 +81,7 @@ define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { store float -1.0, float addrspace(1)* %out ret void } @@ -89,7 +89,7 @@ define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_2.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { store float 2.0, float addrspace(1)* %out ret void } @@ -97,7 +97,7 @@ define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { store float -2.0, float addrspace(1)* %out ret void } @@ -105,7 +105,7 @@ define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_4.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { store float 4.0, float addrspace(1)* %out ret void } @@ -113,7 +113,7 @@ define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { store float -4.0, float addrspace(1)* %out ret void } @@ -123,7 +123,7 @@ define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) { ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e22f983{{$}} ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0.15915494{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { store float 0x3FC45F3060000000, float addrspace(1)* %out ret void } @@ -131,7 +131,7 @@ define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f32: ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbe22f983{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { store float 0xBFC45F3060000000, float addrspace(1)* %out ret void } @@ -139,7 +139,7 @@ define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) { ; GCN-LABEL: {{^}}store_literal_imm_f32: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000 ; GCN: buffer_store_dword [[REG]] -define void @store_literal_imm_f32(float addrspace(1)* %out) { +define amdgpu_kernel 
void @store_literal_imm_f32(float addrspace(1)* %out) { store float 4096.0, float addrspace(1)* %out ret void } @@ -148,7 +148,7 @@ define void @store_literal_imm_f32(float addrspace(1)* %out) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0.0 store float %y, float addrspace(1)* %out ret void @@ -158,7 +158,7 @@ define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out ret void @@ -168,7 +168,7 @@ define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -0.5 store float %y, float addrspace(1)* %out ret void @@ -178,7 +178,7 @@ define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 1.0 store float %y, float addrspace(1)* %out ret void @@ -188,7 +188,7 @@ define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -1.0 store float %y, float addrspace(1)* %out ret void @@ -198,7 +198,7 @@ define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 2.0 store float %y, float addrspace(1)* %out ret void @@ -208,7 +208,7 @@ define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -2.0 store float %y, float addrspace(1)* %out ret void @@ -218,7 +218,7 @@ define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} ; GCN: 
buffer_store_dword [[REG]] -define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 4.0 store float %y, float addrspace(1)* %out ret void @@ -228,7 +228,7 @@ define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, -4.0 store float %y, float addrspace(1)* %out ret void @@ -238,7 +238,7 @@ define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) { ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] ; GCN: buffer_store_dword [[REG]] -define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %x = load float, float addrspace(1)* %in %y = fadd float %x, 0.5 store float %y, float addrspace(1)* %out @@ -249,7 +249,7 @@ define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addr ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]] ; GCN: buffer_store_dword [[REG]] -define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) { %x = load float, float addrspace(1)* %in %y = fadd float %x, 1024.0 store float %y, float addrspace(1)* %out @@ -260,7 +260,7 @@ define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1 ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36a0000000000000 store float %y, float addrspace(1)* %out ret void @@ -270,7 +270,7 @@ define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36b0000000000000 store float %y, float addrspace(1)* %out ret void @@ -280,7 +280,7 @@ define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 16 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36e0000000000000 store float %y, float addrspace(1)* %out ret void @@ -290,7 +290,7 @@ define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void 
@add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xffffffffe0000000 store float %y, float addrspace(1)* %out ret void @@ -300,7 +300,7 @@ define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2{{$}} ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xffffffffc0000000 store float %y, float addrspace(1)* %out ret void @@ -310,7 +310,7 @@ define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -16 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0xfffffffe00000000 store float %y, float addrspace(1)* %out ret void @@ -320,7 +320,7 @@ define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 63 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x36ff800000000000 store float %y, float addrspace(1)* %out ret void @@ -330,7 +330,7 @@ define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) { ; GCN: s_load_dword [[VAL:s[0-9]+]] ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 64 ; GCN: buffer_store_dword [[REG]] -define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { +define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { %y = fadd float %x, 0x3700000000000000 store float %y, float addrspace(1)* %out ret void @@ -342,7 +342,7 @@ define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0.0 store double %y, double addrspace(1)* %out ret void @@ -353,7 +353,7 @@ define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0.5 store double %y, double addrspace(1)* %out ret void @@ -364,7 +364,7 @@ define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -0.5 store double %y, 
double addrspace(1)* %out ret void @@ -375,7 +375,7 @@ define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 1.0 store double %y, double addrspace(1)* %out ret void @@ -386,7 +386,7 @@ define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -1.0 store double %y, double addrspace(1)* %out ret void @@ -397,7 +397,7 @@ define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 2.0 store double %y, double addrspace(1)* %out ret void @@ -408,7 +408,7 @@ define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -2.0 store double %y, double addrspace(1)* %out ret void @@ -419,7 +419,7 @@ define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 4.0 store double %y, double addrspace(1)* %out ret void @@ -430,7 +430,7 @@ define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, -4.0 store double %y, double addrspace(1)* %out ret void @@ -445,7 +445,7 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}} ; VI: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void 
@add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x3fc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -455,7 +455,7 @@ define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xbfc45f306dc9c882 store double %y, double addrspace(1)* %out ret void @@ -466,7 +466,7 @@ define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000001 store double %y, double addrspace(1)* %out ret void @@ -477,7 +477,7 @@ define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}} ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000002 store double %y, double addrspace(1)* %out ret void @@ -488,7 +488,7 @@ define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000010 store double %y, double addrspace(1)* %out ret void @@ -499,7 +499,7 @@ define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xffffffffffffffff store double %y, double addrspace(1)* %out ret void @@ -510,7 +510,7 @@ define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xfffffffffffffffe store double %y, double addrspace(1)* %out ret void @@ -521,7 +521,7 @@ define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 
[[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0xfffffffffffffff0 store double %y, double addrspace(1)* %out ret void @@ -532,7 +532,7 @@ define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x000000000000003F store double %y, double addrspace(1)* %out ret void @@ -543,7 +543,7 @@ define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) { ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64 ; GCN: buffer_store_dwordx2 [[REG]] -define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { +define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { %y = fadd double %x, 0x0000000000000040 store double %y, double addrspace(1)* %out ret void @@ -554,7 +554,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) { ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0 ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { store double 0.0, double addrspace(1)* %out ret void } @@ -564,7 +564,7 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { store double -0.0, double addrspace(1)* %out ret void } @@ -573,7 +573,7 @@ define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { store double 0.5, double addrspace(1)* %out ret void } @@ -582,7 +582,7 @@ define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { store double -0.5, double addrspace(1)* %out ret void } @@ -591,7 +591,7 @@ define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} 
-define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { store double 1.0, double addrspace(1)* %out ret void } @@ -600,7 +600,7 @@ define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { store double -1.0, double addrspace(1)* %out ret void } @@ -609,7 +609,7 @@ define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { store double 2.0, double addrspace(1)* %out ret void } @@ -618,7 +618,7 @@ define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { store double -2.0, double addrspace(1)* %out ret void } @@ -627,7 +627,7 @@ define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { store double 4.0, double addrspace(1)* %out ret void } @@ -636,7 +636,7 @@ define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { store double -4.0, double addrspace(1)* %out ret void } @@ -645,7 +645,7 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inv_2pi_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) { store double 0x3fc45f306dc9c882, double addrspace(1)* %out ret void } @@ -654,7 +654,7 @@ define void @store_inv_2pi_f64(double addrspace(1)* %out) { ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}} -define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) { store double 0xbfc45f306dc9c882, double addrspace(1)* %out ret void } @@ -663,7 +663,7 @@ define void 
@store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
store double 4096.0, double addrspace(1)* %out
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll
index 2e73eb06502..e42d5879189 100644
--- a/llvm/test/CodeGen/AMDGPU/imm16.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm16.ll
@@ -7,7 +7,7 @@
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
store volatile i16 -32768, i16 addrspace(1)* %out
ret void
}
@@ -15,7 +15,7 @@ define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
store half 0.0, half addrspace(1)* %out
ret void
}
@@ -24,7 +24,7 @@ define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
store half -0.0, half addrspace(1)* %out
ret void
}
@@ -32,7 +32,7 @@ define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
store half 0.5, half addrspace(1)* %out
ret void
}
@@ -41,7 +41,7 @@ define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
store half -0.5, half addrspace(1)* %out
ret void
}
@@ -49,7 +49,7 @@ define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
store half 1.0, half addrspace(1)* %out
ret void
}
@@ -58,7 +58,7 @@ define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
store half -1.0, half addrspace(1)* %out
ret void
}
@@ -66,7 +66,7 @@ define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
store half 2.0, half addrspace(1)* %out
ret void
}
@@ -75,7 +75,7 @@ define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
store half -2.0, half addrspace(1)* %out
ret void
}
@@ -83,7 +83,7 @@ define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
store half 4.0, half addrspace(1)* %out
ret void
}
@@ -92,7 +92,7 @@ define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
store half -4.0, half addrspace(1)* %out
ret void
}
@@ -101,7 +101,7 @@ define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
store half 0xH3118, half addrspace(1)* %out
ret void
}
@@ -110,7 +110,7 @@ define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
store half 0xHB118, half addrspace(1)* %out
ret void
}
@@ -118,7 +118,7 @@ define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
; GCN-LABEL: {{^}}store_literal_imm_f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
; GCN: buffer_store_short [[REG]]
-define void @store_literal_imm_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
store half 4096.0, half addrspace(1)* %out
ret void
}
@@ -127,7 +127,7 @@ define void @store_literal_imm_f16(half addrspace(1)* %out) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
%y = fadd half %x, 0.0
store half %y, half addrspace(1)* %out
ret void
@@ -137,7 +137,7 @@ define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
; VI: buffer_store_short [[REG]]
-define void
@add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0.5 store half %y, half addrspace(1)* %out ret void @@ -147,7 +147,7 @@ define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -0.5 store half %y, half addrspace(1)* %out ret void @@ -157,7 +157,7 @@ define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 1.0 store half %y, half addrspace(1)* %out ret void @@ -167,7 +167,7 @@ define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -1.0 store half %y, half addrspace(1)* %out ret void @@ -177,7 +177,7 @@ define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 2.0 store half %y, half addrspace(1)* %out ret void @@ -187,7 +187,7 @@ define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -2.0 store half %y, half addrspace(1)* %out ret void @@ -197,7 +197,7 @@ define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 4.0 store half %y, half addrspace(1)* %out ret void @@ -207,7 +207,7 @@ define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, -4.0 store half %y, half addrspace(1)* %out ret void @@ -217,7 +217,7 @@ define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort 
[[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %x = load half, half addrspace(1)* %in %y = fadd half %x, 0.5 store half %y, half addrspace(1)* %out @@ -228,7 +228,7 @@ define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrsp ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { +define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %x = load half, half addrspace(1)* %in %y = fadd half %x, 1024.0 store half %y, half addrspace(1)* %out @@ -239,7 +239,7 @@ define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0001 store half %y, half addrspace(1)* %out ret void @@ -249,7 +249,7 @@ define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0002 store half %y, half addrspace(1)* %out ret void @@ -259,7 +259,7 @@ define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0010 store half %y, half addrspace(1)* %out ret void @@ -269,7 +269,7 @@ define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFFF store half %y, half addrspace(1)* %out ret void @@ -279,7 +279,7 @@ define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFFE store half %y, half addrspace(1)* %out ret void @@ -289,7 +289,7 @@ define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}} ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void 
@add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xHFFF0 store half %y, half addrspace(1)* %out ret void @@ -299,7 +299,7 @@ define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH003F store half %y, half addrspace(1)* %out ret void @@ -309,7 +309,7 @@ define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { ; VI: buffer_load_ushort [[VAL:v[0-9]+]] ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]] ; VI: buffer_store_short [[REG]] -define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { +define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { %y = fadd half %x, 0xH0040 store half %y, half addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll index c1ed3952cba..6ba502a9fb1 100644 --- a/llvm/test/CodeGen/AMDGPU/immv216.ll +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out ret void } @@ -14,7 +14,7 @@ define void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out ret void } @@ -22,7 +22,7 @@ define void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out ret void } @@ -30,7 +30,7 @@ define void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out ret void } @@ -38,7 +38,7 @@ define void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out ret void } @@ 
-46,7 +46,7 @@ define void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out ret void } @@ -54,7 +54,7 @@ define void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out ret void } @@ -62,7 +62,7 @@ define void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out ret void } @@ -70,7 +70,7 @@ define void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out ret void } @@ -78,7 +78,7 @@ define void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out ret void } @@ -86,7 +86,7 @@ define void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out ret void } @@ -94,7 +94,7 @@ define void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out ret void } @@ -102,7 +102,7 @@ define void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16: ; GCN: v_mov_b32_e32 
[[REG:v[0-9]+]], 0xb118b118{{$}} ; GCN: buffer_store_dword [[REG]] -define void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out ret void } @@ -110,7 +110,7 @@ define void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 ; GCN-LABEL: {{^}}store_literal_imm_v2f16: ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00 ; GCN: buffer_store_dword [[REG]] -define void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out ret void } @@ -126,7 +126,7 @@ define void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0.0, half 0.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -143,7 +143,7 @@ define void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0.5, half 0.5> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -160,7 +160,7 @@ define void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half -0.5, half -0.5> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -177,7 +177,7 @@ define void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x ha ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 1.0, half 1.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -194,7 +194,7 @@ define void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half -1.0, half -1.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -211,7 +211,7 @@ define void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x ha ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) 
#0 { +define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 2.0, half 2.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -228,7 +228,7 @@ define void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half -2.0, half -2.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -245,7 +245,7 @@ define void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x ha ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 4.0, half 4.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -262,7 +262,7 @@ define void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half -4.0, half -4.0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -280,7 +280,7 @@ define void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x ha ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} ; VI: v_or_b32 ; VI: buffer_store_dword -define void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %x = load <2 x half>, <2 x half> addrspace(1)* %in %y = fadd <2 x half> %x, <half 0.5, half 0.5> store <2 x half> %y, <2 x half> addrspace(1)* %out @@ -301,7 +301,7 @@ define void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} ; VI: v_or_b32 ; VI: buffer_store_dword -define void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %x = load <2 x half>, <2 x half> addrspace(1)* %in %y = fadd <2 x half> %x, <half 1024.0, half 1024.0> store <2 x half> %y, <2 x half> addrspace(1)* %out @@ -319,7 +319,7 @@ define void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -336,7 +336,7 @@ define void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword 
-define void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -353,7 +353,7 @@ define void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -370,7 +370,7 @@ define void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> % ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xHFFFF, half 0xHFFFF> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -387,7 +387,7 @@ define void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xHFFFE, half 0xHFFFE> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -404,7 +404,7 @@ define void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xHFFF0, half 0xHFFF0> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -421,7 +421,7 @@ define void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x hal ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void @@ -438,7 +438,7 @@ define void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> % ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]] ; VI: v_or_b32 ; VI: buffer_store_dword -define void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { +define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040> store <2 x half> %y, <2 x half> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll index 877956be308..8e207a38c84 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll @@ -10,7 +10,7 @@ ; CHECK: s_mov_b32 m0, [[IN]] ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]] ; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in store <4 x float> %ins, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll index f83363de3aa..b18ae353ca4 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll @@ -19,7 +19,7 @@ ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_w_offset(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) { entry: %idx = add i32 %in, 1 %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx @@ -44,7 +44,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { +define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) { entry: %idx = add i32 %in, 1 %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4> @@ -66,7 +66,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]] ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) { entry: %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in store float %elt, float addrspace(1)* %out @@ -84,7 +84,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index @@ -105,7 +105,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}} ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { +define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) { entry: %index = add i32 %offset, -512 %or = or <4 x i32> %vec0, %vec1 @@ -137,7 +137,7 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[RESULT]] -define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { +define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 @@ -147,7 +147,7 @@ entry: } ; GCN-LABEL: {{^}}extract_undef_offset_sgpr: -define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* 
%in) { +define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %value = extractelement <4 x i32> %ld, i32 undef @@ -159,7 +159,7 @@ entry: ; GCN-DAG: buffer_load_dwordx4 ; MOVREL-DAG: s_mov_b32 m0, ; MOVREL: v_movreld_b32 -define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %value = insertelement <4 x i32> %ld, i32 5, i32 undef @@ -178,7 +178,7 @@ entry: ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]] ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}} -define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %0 = add i32 %in, 1 %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0 @@ -197,7 +197,7 @@ entry: ; IDXMODE-NEXT: s_set_gpr_idx_off ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]: -define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) { entry: %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in store <4 x float> %0, <4 x float> addrspace(1)* %out @@ -213,7 +213,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) { entry: %index = add i32 %offset, -512 %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index @@ -233,7 +233,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5 ; IDXMODE-NEXT: s_set_gpr_idx_off -define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { +define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) { entry: %index = add i32 %offset, -512 %value = insertelement <4 x i32> %vec, i32 5, i32 %index @@ -270,7 +270,7 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword -define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -512 @@ -305,7 +305,7 @@ entry: ; GCN: s_cbranch_execnz ; IDXMODE: s_set_gpr_idx_off -define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %index = add i32 %id, -16 @@ -375,7 +375,7 @@ entry: ; GCN: buffer_store_dword [[MOVREL0]] ; GCN: buffer_store_dword [[MOVREL1]] -define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %id.ext = zext i32 %id to i64 @@ -450,7 +450,7 @@ bb2: ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]: ; GCN: buffer_store_dword [[INS0]] -define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { +define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 { entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 %id.ext = zext i32 %id to i64 @@ -499,7 +499,7 @@ bb2: ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @extract_adjacent_blocks(i32 %arg) #0 { +define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 { bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 @@ -549,7 +549,7 @@ bb7: ; GCN: [[ENDBB]]: ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { +define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 { bb: %tmp = icmp eq i32 %arg, 0 br i1 %tmp, label %bb1, label %bb4 @@ -610,7 +610,7 @@ bb7: ; preds = %bb4, %bb1 ; GCN: ds_write_b32 ; GCN: ds_write_b32 ; GCN: s_endpgm -define void @multi_same_block(i32 %arg) #0 { +define amdgpu_kernel void @multi_same_block(i32 %arg) #0 { bb: %tmp1 = add i32 %arg, -16 %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1 @@ -637,7 +637,7 @@ bb: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %offset = add i32 %idx, 3 @@ -658,7 +658,7 @@ entry: ; IDXMODE: s_set_gpr_idx_off ; GCN: buffer_store_dword [[EXTRACT]] -define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { +define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %offset = add i32 %idx, 4 @@ -681,7 +681,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { +define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) { entry: %ld = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in %idx.shl = shl i32 %idx.in, 2 @@ -702,7 +702,7 @@ entry: ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; IDXMODE: s_set_gpr_idx_off -define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { +define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind { %idx.shl = shl i32 %idx.in, 2 %idx = or i32 %idx.shl, 1 %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx 
@@ -729,7 +729,7 @@ define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x f ; IDXMODE: s_set_gpr_idx_idx ; IDXMODE: v_mov_b32_e32 ; GCN: s_cbranch_execnz [[REGLOOP]] -define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { +define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 { bb: br label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll index a4acfa6d306..b6dea01cab4 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-private-64.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.barrier() #0 ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write_b64 ; CI-PROMOTE: ds_read_b64 -define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 { %val = load double, double addrspace(1)* %in, align 8 %array = alloca [8 x double], align 8 %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b @@ -51,7 +51,7 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 ; CI-PROMOTE: ds_read2_b64 -define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x double>], align 16 %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b @@ -77,7 +77,7 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write_b64 ; CI-PROMOTE: ds_read_b64 -define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 { %val = load i64, i64 addrspace(1)* %in, align 8 %array = alloca [8 x i64], align 8 %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b @@ -109,7 +109,7 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs ; SI-PROMOTE: ds_read_b64 ; CI-PROMOTE: ds_write2_b64 ; CI-PROMOTE: ds_read2_b64 -define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { +define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16 %array = alloca [4 x <2 x i64>], align 16 %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll index 990f33518ab..7cee8a41c12 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll @@ -2,7 +2,7 @@ ; REQUIRES: asserts ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -define void @inf_loop_irreducible_cfg() nounwind { +define amdgpu_kernel void @inf_loop_irreducible_cfg() nounwind { entry: br label 
%block diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e0b695934c..73482756b8c 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -7,7 +7,7 @@ ; SI: buffer_store_dword [[REG]] ; SI: s_waitcnt vmcnt(0) expcnt(0) ; SI: s_branch BB0_1 -define void @infinite_loop(i32 addrspace(1)* %out) { +define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) { entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.ll index db1a0c67436..85eb163383e 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-asm.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-asm.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}inline_asm: ; CHECK: s_endpgm ; CHECK: s_endpgm -define void @inline_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) { entry: store i32 5, i32 addrspace(1)* %out call void asm sideeffect "s_endpgm", ""() @@ -25,7 +25,7 @@ entry: ; Make sure inline assembly is treted as divergent. ; CHECK: s_mov_b32 s{{[0-9]+}}, 0 ; CHECK: s_and_saveexec_b64 -define void @branch_on_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) { %zero = call i32 asm "s_mov_b32 $0, 0", "=s"() %cmp = icmp eq i32 %zero, 0 br i1 %cmp, label %if, label %endif @@ -44,7 +44,7 @@ endif: ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]] ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in) store i64 %sgpr, i64 addrspace(1)* %out ret void @@ -52,7 +52,7 @@ define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) { ; CHECK-LABEL: {{^}}code_size_inline_asm: ; CHECK: codeLenInByte = 12 -define void @code_size_inline_asm(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e64", ""() ret void @@ -61,7 +61,7 @@ entry: ; All inlineasm instructions are assumed to be the maximum size ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst: ; CHECK: codeLenInByte = 12 -define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e32", ""() ret void @@ -69,7 +69,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect " v_nop_e64 @@ -80,7 +80,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) { entry: call void asm sideeffect " v_nop_e64 @@ -92,7 +92,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) { entry: call void asm sideeffect "", ""() ret void @@ -100,7 +100,7 @@ entry: ; CHECK-LABEL: 
{{^}}code_size_inline_asm_1_comment: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment", ""() ret void @@ -108,7 +108,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) { entry: call void asm sideeffect " ; comment", ""() @@ -117,7 +117,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment ", ""() @@ -126,7 +126,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; first comment ; second comment", ""() ret void @@ -134,7 +134,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace: ; CHECK: codeLenInByte = 4 -define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; first comment;second comment", ""() ret void @@ -142,7 +142,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment v_nop_e64 ; inline comment @@ -157,7 +157,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) { entry: call void asm sideeffect "v_nop_e64 ; inline comment ; separate comment @@ -171,7 +171,7 @@ entry: ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands: ; CHECK: codeLenInByte = 20 -define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) { +define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) { entry: call void asm sideeffect "; comment v_add_i32_e32 v0, vcc, v1, v2 ; inline comment diff --git a/llvm/test/CodeGen/AMDGPU/inline-calls.ll b/llvm/test/CodeGen/AMDGPU/inline-calls.ll index 4541a902c1b..f8821f31989 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-calls.ll @@ -11,7 +11,7 @@ entry: ; CHECK: {{^}}kernel: ; CHECK-NOT: call -define void @kernel(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @func(i32 1) store i32 %tmp0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; CHECK: {{^}}kernel2: ; CHECK-NOT: call -define void @kernel2(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel2(i32 addrspace(1)* %out) { entry: call void @kernel(i32 addrspace(1)* %out) ret void @@ -31,7 +31,7 @@ entry: ; CHECK: 
{{^}}kernel3: ; CHECK-NOT: call -define void @kernel3(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel3(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @func_alias(i32 1) store i32 %tmp0, i32 addrspace(1)* %out @@ -43,7 +43,7 @@ entry: ; CHECK: {{^}}kernel4: ; CHECK-NOT: call -define void @kernel4(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel4(i32 addrspace(1)* %out) { entry: call void @kernel_alias(i32 addrspace(1)* %out) ret void diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll index 1bcbd14009c..941a1b90dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll +++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll @@ -10,7 +10,7 @@ ; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] ; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] -define void @inline_reg_constraints(i32 addrspace(1)* %ptr) { +define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) { entry: %v32 = tail call i32 asm sideeffect "flat_load_dword $0, $1", "=v,v"(i32 addrspace(1)* %ptr) %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr) @@ -27,7 +27,7 @@ entry: ; GCN: s_mov_b32 m0, -1 ; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; GCN: ; use [[COPY_M0]] -define void @inline_sreg_constraint_m0() { +define amdgpu_kernel void @inline_sreg_constraint_m0() { %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"() tail call void asm sideeffect "; use $0", "s"(i32 %m0) ret void @@ -36,7 +36,7 @@ define void @inline_sreg_constraint_m0() { ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32: ; GCN: s_mov_b32 [[REG:s[0-9]+]], 32 ; GCN: ; use [[REG]] -define void @inline_sreg_constraint_imm_i32() { +define amdgpu_kernel void @inline_sreg_constraint_imm_i32() { tail call void asm sideeffect "; use $0", "s"(i32 32) ret void } @@ -44,7 +44,7 @@ define void @inline_sreg_constraint_imm_i32() { ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32: ; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0 ; GCN: ; use [[REG]] -define void @inline_sreg_constraint_imm_f32() { +define amdgpu_kernel void @inline_sreg_constraint_imm_f32() { tail call void asm sideeffect "; use $0", "s"(float 1.0) ret void } @@ -54,7 +54,7 @@ define void @inline_sreg_constraint_imm_f32() { ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}} ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}} ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} -define void @inline_sreg_constraint_imm_i64() { +define amdgpu_kernel void @inline_sreg_constraint_imm_i64() { tail call void asm sideeffect "; use $0", "s"(i64 -4) ret void } @@ -63,7 +63,7 @@ define void @inline_sreg_constraint_imm_i64() { ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}} ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}} ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}} -define void @inline_sreg_constraint_imm_f64() { +define amdgpu_kernel void @inline_sreg_constraint_imm_f64() { tail call void asm sideeffect "; use $0", "s"(double 1.0) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll index 75f3158937d..15e57fe6bff 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-16.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}s_input_output_i16: ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i16() #0 { +define amdgpu_kernel void @s_input_output_i16() #0 { %v = 
tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i16 %v) #0 ret void @@ -14,7 +14,7 @@ define void @s_input_output_i16() #0 { ; GCN-LABEL: {{^}}v_input_output_i16: ; SICI: error: couldn't allocate output register for constraint 'v' ; SICI: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_i16() #0 { +define amdgpu_kernel void @v_input_output_i16() #0 { %v = tail call i16 asm sideeffect "v_mov_b32 $0, -1", "=v"() #0 tail call void asm sideeffect "; use $0", "v"(i16 %v) ret void @@ -23,7 +23,7 @@ define void @v_input_output_i16() #0 { ; GCN-LABEL: {{^}}s_input_output_f16: ; SICI: error: couldn't allocate output register for constraint 's' ; SICI: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_f16() #0 { +define amdgpu_kernel void @s_input_output_f16() #0 { %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() #0 tail call void asm sideeffect "; use $0", "s"(half %v) ret void @@ -32,7 +32,7 @@ define void @s_input_output_f16() #0 { ; GCN-LABEL: {{^}}v_input_output_f16: ; SICI: error: couldn't allocate output register for constraint 'v' ; SICI: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_f16() #0 { +define amdgpu_kernel void @v_input_output_f16() #0 { %v = tail call half asm sideeffect "v_mov_b32 $0, -1", "=v"() #0 tail call void asm sideeffect "; use $0", "v"(half %v) ret void diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll index 2eb21f07e0e..c1d67ba614c 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll @@ -3,7 +3,7 @@ ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i8() { +define amdgpu_kernel void @s_input_output_i8() { %v = tail call i8 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i8 %v) ret void @@ -11,7 +11,7 @@ define void @s_input_output_i8() { ; GCN: error: couldn't allocate output register for constraint 'v' ; GCN: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_i8() { +define amdgpu_kernel void @v_input_output_i8() { %v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(i8 %v) ret void @@ -19,7 +19,7 @@ define void @v_input_output_i8() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_i128() { +define amdgpu_kernel void @s_input_output_i128() { %v = tail call i128 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i128 %v) ret void @@ -27,7 +27,7 @@ define void @s_input_output_i128() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v8f16() { +define amdgpu_kernel void @s_input_output_v8f16() { %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<8 x half> %v) ret void @@ -36,7 +36,7 @@ define void @s_input_output_v8f16() { ; CI: error: couldn't allocate output register for constraint 's' ; CI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error -define void @s_input_output_f16() { +define amdgpu_kernel void 
@s_input_output_f16() { %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(half %v) ret void @@ -44,7 +44,7 @@ define void @s_input_output_f16() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v2f16() { +define amdgpu_kernel void @s_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x half> %v) ret void @@ -52,7 +52,7 @@ define void @s_input_output_v2f16() { ; GCN: error: couldn't allocate output register for constraint 'v' ; GCN: error: couldn't allocate input reg for constraint 'v' -define void @v_input_output_v2f16() { +define amdgpu_kernel void @v_input_output_v2f16() { %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"() tail call void asm sideeffect "; use $0", "v"(<2 x half> %v) ret void @@ -61,7 +61,7 @@ define void @v_input_output_v2f16() { ; CI: error: couldn't allocate output register for constraint 's' ; CI: error: couldn't allocate input reg for constraint 's' ; VI-NOT: error -define void @s_input_output_i16() { +define amdgpu_kernel void @s_input_output_i16() { %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(i16 %v) ret void @@ -69,14 +69,14 @@ define void @s_input_output_i16() { ; GCN: error: couldn't allocate output register for constraint 's' ; GCN: error: couldn't allocate input reg for constraint 's' -define void @s_input_output_v2i16() { +define amdgpu_kernel void @s_input_output_v2i16() { %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"() tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v) ret void } ; FIXME: Crash in codegen prepare -; define void @s_input_output_i3() { +; define amdgpu_kernel void @s_input_output_i3() { ; %v = tail call i3 asm sideeffect "s_mov_b32 $0, -1", "=s"() ; tail call void asm sideeffect "; use $0", "s"(i3 %v) ; ret void diff --git a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll index 23a0ed3ca05..3c6c7e1d1b4 100644 --- a/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll +++ b/llvm/test/CodeGen/AMDGPU/inlineasm-packed.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}inline_asm_input_v2i16: ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} -define void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 { +define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0 store i32 %val, i32 addrspace(1)* %out @@ -11,7 +11,7 @@ entry: ; GCN-LABEL: {{^}}inline_asm_input_v2f16: ; GCN: s_mov_b32 s0, s{{[0-9]+}} -define void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 { +define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 { entry: %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0 store i32 %val, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; GCN-LABEL: {{^}}inline_asm_output_v2i16: ; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}} -define void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 { entry: %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0 store <2 x i16> %val, <2 x i16> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; GCN-LABEL: {{^}}inline_asm_output_v2f16: ; 
GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}} -define void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { entry: %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0 store <2 x half> %val, <2 x half> addrspace(1)* %out @@ -38,7 +38,7 @@ entry: ; GCN-LABEL: {{^}}inline_asm_packed_v2i16: ; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 { +define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 { entry: %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0 store <2 x i16> %val, <2 x i16> addrspace(1)* %out @@ -47,7 +47,7 @@ entry: ; GCN-LABEL: {{^}}inline_asm_packed_v2f16: ; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 { +define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 { entry: %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0 store <2 x half> %val, <2 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll index 4a5e8869c2d..e895f27c886 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_subreg.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_subreg.ll @@ -6,7 +6,7 @@ ; Make sure this doesn't crash ; CHECK-LABEL: test: -define void @test(i64 addrspace(1)* %out) { +define amdgpu_kernel void @test(i64 addrspace(1)* %out) { entry: %tmp0 = alloca [16 x i32] %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 97ef5ce9039..76aad64e73a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -18,56 +18,56 @@ ; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]] ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]: -define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_1: -define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_2: -define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4f32_3: -define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { +define 
amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind { %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v4i32_0: -define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { +define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind { %vecins = insertelement <4 x i32> %a, i32 999, i32 0 store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_1: -define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_2: -define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void } ; GCN-LABEL: {{^}}insertelement_v3f32_3: -define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { +define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3 store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void @@ -86,7 +86,7 @@ define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind { ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind { %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8 ret void @@ -97,7 +97,7 @@ define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x fl ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v -define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind { %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16 ret void @@ -107,7 +107,7 @@ define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x fl ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]: -define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind { %vecins = 
insertelement <4 x float> %a, float 5.000000e+00, i32 %b store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16 ret void @@ -117,7 +117,7 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}} ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind { %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32 ret void @@ -129,7 +129,7 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind { %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64 ret void @@ -138,7 +138,7 @@ define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32: ; GCN: v_movreld_b32 ; GCN: buffer_store_dwordx2 -define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind { %vecins = insertelement <2 x i32> %a, i32 5, i32 %b store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8 ret void @@ -148,7 +148,7 @@ define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]: ; GCN-DAG: buffer_store_dword v -define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind { %vecins = insertelement <3 x i32> %a, i32 5, i32 %b store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16 ret void @@ -159,7 +159,7 @@ define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]] ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]] ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind { %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16 ret void @@ -169,7 +169,7 @@ define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> ; GCN: v_movreld_b32 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind { %vecins = insertelement <8 x i32> %a, i32 5, i32 %b store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32 ret void @@ 
-181,21 +181,21 @@ define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind { %vecins = insertelement <16 x i32> %a, i32 5, i32 %b store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v2i16: -define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind { %vecins = insertelement <2 x i16> %a, i16 5, i32 %b store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i16: -define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind { %vecins = insertelement <3 x i16> %a, i16 5, i32 %b store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8 ret void @@ -222,7 +222,7 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> ; GCN: buffer_load_dwordx2 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off -define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind { %vecins = insertelement <4 x i16> %a, i16 5, i32 %b store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8 ret void @@ -242,7 +242,7 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> ; GCN-TONGA: buffer_load_ushort ; GCN: buffer_store_short v{{[0-9]+}}, off -define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind { %vecins = insertelement <2 x i8> %a, i8 5, i32 %b store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8 ret void @@ -265,7 +265,7 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off -define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind { %vecins = insertelement <3 x i8> %a, i8 5, i32 %b store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -291,21 +291,21 @@ define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a ; GCN-TONGA: buffer_load_dword ; GCN: buffer_store_dword v{{[0-9]+}}, off -define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind { %vecins = insertelement <4 x i8> %a, i8 5, i32 %b store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8: -define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x 
i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind { %vecins = insertelement <8 x i8> %a, i8 5, i32 %b store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8: -define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind { %vecins = insertelement <16 x i8> %a, i8 5, i32 %b store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16 ret void @@ -314,7 +314,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies. Check that ; the compiler doesn't crash. ; GCN-LABEL: {{^}}insert_split_bb: -define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { +define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) { entry: %0 = insertelement <2 x i32> undef, i32 %a, i32 0 %1 = icmp eq i32 %a, 0 @@ -361,7 +361,7 @@ endif: ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind { %vecins = insertelement <2 x double> %a, double 8.0, i32 %b store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16 ret void @@ -374,14 +374,14 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind { %vecins = insertelement <2 x i64> %a, i64 5, i32 %b store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8 ret void } ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64: -define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind { %vecins = insertelement <3 x i64> %a, i64 5, i32 %b store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32 ret void @@ -411,7 +411,7 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> ; GCN: s_endpgm ; GCN: ScratchSize: 64 -define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind { %vecins = insertelement <4 x double> %a, double 8.0, i32 %b store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16 ret void @@ -438,7 +438,7 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm ; GCN: ScratchSize: 128 -define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { +define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind { %vecins = insertelement <8 x double> %a, double 8.0, i32 %b store <8 x double> %vecins, <8 x 
double> addrspace(1)* %out, align 16 ret void diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index b7945842e6e..a3f82b8a011 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -10,7 +10,7 @@ ; GFX9-NOT: lshr ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]] -define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -28,7 +28,7 @@ define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> add ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] -define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -48,7 +48,7 @@ define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GFX9-DAG: ; use [[ELT1]] -define void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt1 = extractelement <2 x i16> %vec, i32 1 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -68,7 +68,7 @@ define void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %o ; GFX9-NOT: [[ELT0]] ; GFX9-NOT: [[VEC]] ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] -define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -87,7 +87,7 @@ define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i1 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]] ; GFX9: ; use [[ELT1]] -define void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -112,7 +112,7 @@ define void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* % ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] ; GFX9: ; use [[ELT_HI]] ; GFX9: ; use [[VEC_HI]] -define void 
@s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 %elt = trunc i32 %elt.hi to i16 @@ -136,7 +136,7 @@ define void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace( ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7 -define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -152,7 +152,7 @@ define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> add ; GCN-NOT: shlr ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] -define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out @@ -166,7 +166,7 @@ define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]] -define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -181,7 +181,7 @@ define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> a ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500 -define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 store <2 x half> %vecins, <2 x half> addrspace(1)* %out @@ -197,7 +197,7 @@ define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> a ; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -221,7 +221,7 @@ define void 
@v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> add ; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -244,7 +244,7 @@ define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i1 ; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -266,7 +266,7 @@ define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -283,7 +283,7 @@ define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> add ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -305,7 +305,7 @@ define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -325,7 +325,7 @@ define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> a ; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -345,7 +345,7 @@ define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -362,7 +362,7 @@ define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> a ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]] ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] -define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext @@ -383,7 +383,7 @@ define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 { +define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 { %idx = load volatile i32, i32 addrspace(2)* %idx.ptr %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx @@ -399,7 +399,7 @@ define void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i1 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -424,7 +424,7 @@ define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 
@llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -451,7 +451,7 @@ define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { +define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir index e97cb1b5c34..1479303712d 100644 --- a/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/llvm/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -4,17 +4,17 @@ # RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9 --- | - define void @div_fmas() { ret void } - define void @s_getreg() { ret void } - define void @s_setreg() { ret void } - define void @vmem_gt_8dw_store() { ret void } - define void @readwrite_lane() { ret void } - define void @rfe() { ret void } - define void @s_mov_fed_b32() { ret void } - define void @s_movrel() { ret void } - define void @v_interp() { ret void } - - define void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) { + define amdgpu_kernel void @div_fmas() { ret void } + define amdgpu_kernel void @s_getreg() { ret void } + define amdgpu_kernel void @s_setreg() { ret void } + define amdgpu_kernel void @vmem_gt_8dw_store() { ret void } + define amdgpu_kernel void @readwrite_lane() { ret void } + define amdgpu_kernel void @rfe() { ret void } + define amdgpu_kernel void @s_mov_fed_b32() { ret void } + define amdgpu_kernel void @s_movrel() { ret void } + define amdgpu_kernel void @v_interp() { ret void } + + define amdgpu_kernel void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) { entry: %A.addr = alloca i32 addrspace(1)*, align 4 store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/internalize.ll b/llvm/test/CodeGen/AMDGPU/internalize.ll index 5a6669977b8..968b1d326a7 100644 --- a/llvm/test/CodeGen/AMDGPU/internalize.ll +++ b/llvm/test/CodeGen/AMDGPU/internalize.ll @@ -8,14 +8,14 @@ @gvar_used = addrspace(1) global i32 undef, align 4 ; Function Attrs: alwaysinline nounwind -define void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 { +define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 { entry: store i32 1, i32 addrspace(1)* %out ret void } ; Function Attrs: alwaysinline nounwind -define void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 { +define amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 { entry: store i32 %tid, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll index c29434f5eca..31f2fbc919a 100644 --- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll @@ -1,7 +1,7 @@ ; RUN: not llc -march=amdgcn -mcpu=bonaire 
-mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s ; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast -define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) { +define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) { %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)* store volatile i32 0, i32 addrspace(1)* %stof ret void diff --git a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll index 45a061067cf..5cd965d2fa9 100644 --- a/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll +++ b/llvm/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll @@ -10,7 +10,7 @@ ; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]], ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], [[PTR]] -define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 @@ -22,7 +22,7 @@ define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 add ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}} ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]: -define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { +define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 { %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1 store i16 123, i16 addrspace(1)* %ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir index 66182d09289..bc1dafe0ea1 100644 --- a/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir +++ b/llvm/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir @@ -1,7 +1,7 @@ # RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s --- | - define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { entry: br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 diff --git a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll index f9a0d0e9a32..37dd977ae21 100644 --- a/llvm/test/CodeGen/AMDGPU/kcache-fold.ll +++ b/llvm/test/CodeGen/AMDGPU/kcache-fold.ll @@ -2,7 +2,7 @@ ; CHECK: {{^}}main1: ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}} -define void @main1() #0 { +define amdgpu_kernel void @main1() #0 { main_body: %tmp = load <4 x float>, <4 x float> addrspace(8)* null %tmp7 = extractelement <4 x float> %tmp, i32 0 @@ -54,7 +54,7 @@ main_body: ; CHECK: {{^}}main2: ; CHECK-NOT: MOV -define void @main2() #0 { +define amdgpu_kernel void @main2() #0 { main_body: %tmp = load <4 x float>, <4 x float> addrspace(8)* null %tmp7 = 
extractelement <4 x float> %tmp, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll index aaa63f7196b..8e358ef2804 100644 --- a/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}no_args: ; CHECK: ScratchSize: 5{{$}} -define void @no_args() { +define amdgpu_kernel void @no_args() { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -13,7 +13,7 @@ define void @no_args() { ; CHECK-LABEL: {{^}}force_align32: ; CHECK: ScratchSize: 5{{$}} -define void @force_align32(<8 x i32>) { +define amdgpu_kernel void @force_align32(<8 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -21,7 +21,7 @@ define void @force_align32(<8 x i32>) { ; CHECK-LABEL: {{^}}force_align64: ; CHECK: ScratchSize: 5{{$}} -define void @force_align64(<16 x i32>) { +define amdgpu_kernel void @force_align64(<16 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -29,7 +29,7 @@ define void @force_align64(<16 x i32>) { ; CHECK-LABEL: {{^}}force_align128: ; CHECK: ScratchSize: 5{{$}} -define void @force_align128(<32 x i32>) { +define amdgpu_kernel void @force_align128(<32 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void @@ -37,7 +37,7 @@ define void @force_align128(<32 x i32>) { ; CHECK-LABEL: {{^}}force_align256: ; CHECK: ScratchSize: 5{{$}} -define void @force_align256(<64 x i32>) { +define amdgpu_kernel void @force_align256(<64 x i32>) { %alloca = alloca i8 store volatile i8 0, i8* %alloca ret void diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll index 95a68319f8a..6fa26cb3879 100644 --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -17,7 +17,7 @@ ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { entry: %0 = zext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -36,7 +36,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { entry: %0 = zext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { +define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind { entry: %0 = sext i8 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -75,7 +75,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { entry: %0 = zext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -94,7 +94,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void 
@i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { entry: %0 = zext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -113,7 +113,7 @@ entry: ; FIXME: Should be using s_load_dword ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]] -define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { +define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind { entry: %0 = sext i16 %in to i32 store i32 %0, i32 addrspace(1)* %out, align 4 @@ -126,7 +126,7 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8 -define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { +define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind { entry: store i32 %in, i32 addrspace(1)* %out, align 4 ret void @@ -138,7 +138,7 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8 -define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { +define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind { entry: store float %in, float addrspace(1)* %out, align 4 ret void @@ -152,7 +152,7 @@ entry: ; MESA-GCN: buffer_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { +define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) { entry: store <2 x i8> %in, <2 x i8> addrspace(1)* %out ret void @@ -166,7 +166,7 @@ entry: ; MESA-GCN: buffer_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { +define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(1)* %out ret void @@ -179,7 +179,7 @@ entry: ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { +define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind { entry: store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4 ret void @@ -192,7 +192,7 @@ entry: ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8 -define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { +define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind { entry: store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4 ret void @@ -209,7 +209,7 @@ entry: ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { +define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind { entry: store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4 ret void @@ -226,7 +226,7 @@ entry: ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort 
-define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { +define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind { entry: store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { +define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind { entry: store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4 ret void @@ -253,7 +253,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { +define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind { entry: store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4 ret void @@ -273,7 +273,7 @@ entry: ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { +define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(1)* %out ret void @@ -293,7 +293,7 @@ entry: ; HSA-GCN: flat_load_ushort ; HSA-GCN: flat_load_ushort ; HSA-GCN: flat_load_ushort -define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { +define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) { entry: store <4 x i16> %in, <4 x i16> addrspace(1)* %out ret void @@ -308,7 +308,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { +define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -323,7 +323,7 @@ entry: ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10 -define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { +define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind { entry: store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4 ret void @@ -354,7 +354,7 @@ entry: ; HSA-GCN: float_load_ubyte ; HSA-GCN: float_load_ubyte ; HSA-GCN: float_load_ubyte -define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { +define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) { entry: store <8 x i8> %in, <8 x i8> addrspace(1)* %out ret void @@ -386,7 +386,7 @@ entry: ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { +define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) { entry: store <8 x i16> %in, <8 x i16> addrspace(1)* %out ret void @@ -405,7 +405,7 @@ entry: ; SI: s_load_dwordx8 
s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20 -define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { +define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind { entry: store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4 ret void @@ -422,7 +422,7 @@ entry: ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11 -define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { +define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind { entry: store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4 ret void @@ -478,7 +478,7 @@ entry: ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte ; HSA-VI: flat_load_ubyte -define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { +define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) { entry: store <16 x i8> %in, <16 x i8> addrspace(1)* %out ret void @@ -534,7 +534,7 @@ entry: ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort ; HSA-VI: flat_load_ushort -define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { +define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) { entry: store <16 x i16> %in, <16 x i16> addrspace(1)* %out ret void @@ -561,7 +561,7 @@ entry: ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { +define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind { entry: store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4 ret void @@ -588,7 +588,7 @@ entry: ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40 -define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { +define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind { entry: store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4 ret void @@ -599,7 +599,7 @@ entry: ; MESA-GCN: s_load_dwordx2 ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } @@ -611,7 +611,7 @@ define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c ; MESA-GCN: buffer_store_dwordx2 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8 -define void @f64_kernel_arg(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) { entry: store double %in, double addrspace(1)* %out ret void @@ -621,7 +621,7 @@ entry: ; XGCN: s_load_dwordx2 ; XGCN: s_load_dwordx2 ; XGCN: buffer_store_dwordx2 -; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +; define 
amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { ; store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ; ret void ; } @@ -631,7 +631,7 @@ entry: ; SI: v_and_b32_e32 ; SI: buffer_store_byte ; SI: s_endpgm -define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { store i1 %x, i1 addrspace(1)* %out, align 1 ret void } @@ -640,7 +640,7 @@ define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind { ; SI: buffer_load_ubyte ; SI: buffer_store_dword ; SI: s_endpgm -define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i32 store i32 %ext, i32 addrspace(1)* %out, align 4 ret void @@ -650,7 +650,7 @@ define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { ; SI: buffer_load_ubyte ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = zext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void @@ -660,7 +660,7 @@ define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { ; SI: buffer_load_ubyte ; SI: buffer_store_dword ; SI: s_endpgm -define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i32 store i32 %ext, i32addrspace(1)* %out, align 4 ret void @@ -672,7 +672,7 @@ define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind { ; SI: v_ashrrev_i32 ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { +define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind { %ext = sext i1 %x to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index d2b5ad44ab3..4af37d8da96 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -48,7 +48,7 @@ ; Scratch size = alloca size + emergency stack slot ; ALL: ; ScratchSize: 32772 -define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { +define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 { %large = alloca [8192 x i32], align 4 %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 store volatile i32 %x, i32* %gep diff --git a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll index 9975b1b7f5c..c46d68e38ad 100644 --- a/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/large-constant-initializer.ll @@ -4,7 +4,7 @@ @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4 -define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { +define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4 %mul12 = mul nsw i32 %val, 7 br i1 undef, label %exit, label %bb diff --git a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll index 93b33736b2a..13dd7058c50 
100644 --- a/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll @@ -4,7 +4,7 @@ ; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 ; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -26,7 +26,7 @@ entry: ; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { +define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -48,7 +48,7 @@ entry: ; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4 -define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { +define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -71,7 +71,7 @@ entry: ; ALL-LABEL: @occupancy_0( ; CI-NOT: alloca [5 x i32] ; SI: alloca [5 x i32] -define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { +define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -94,7 +94,7 @@ entry: ; ALL-LABEL: @occupancy_max( ; CI-NOT: alloca [5 x i32] ; SI: alloca [5 x i32] -define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { +define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -118,7 +118,7 @@ entry: ; CI-LABEL: @occupancy_6( ; SI: alloca ; CI-NOT: alloca -define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [42 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -142,7 +142,7 @@ entry: ; ALL-LABEL: @occupancy_6_over( ; ALL: alloca [43 x i8] -define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { +define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [43 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -168,7 +168,7 @@ entry: ; CI-LABEL: @occupancy_8( ; SI: alloca ; CI-NOT: alloca -define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [32 x i8], align 4 %tmp = load i8, i8 
addrspace(1)* %in, align 1 @@ -192,7 +192,7 @@ entry: ; ALL-LABEL: @occupancy_8_over( ; ALL: alloca [33 x i8] -define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [33 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -218,7 +218,7 @@ entry: ; CI-LABEL: @occupancy_9( ; SI: alloca ; CI-NOT: alloca -define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: %stack = alloca [28 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 @@ -242,7 +242,7 @@ entry: ; ALL-LABEL: @occupancy_9_over( ; ALL: alloca [29 x i8] -define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { +define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 { entry: %stack = alloca [29 x i8], align 4 %tmp = load i8, i8 addrspace(1)* %in, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll index 99334585e58..c23dea2b6b7 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-alignment.ll @@ -15,7 +15,7 @@ declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace ; HSA-LABEL: {{^}}test_no_round_size_1: ; HSA: workgroup_group_segment_byte_size = 38 -define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) @@ -34,7 +34,7 @@ define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) # ; HSA-LABEL: {{^}}test_round_size_2: ; HSA: workgroup_group_segment_byte_size = 86 ; HSA: group_segment_alignment = 4 -define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false) @@ -50,7 +50,7 @@ define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { ; HSA-LABEL: {{^}}test_round_size_2_align_8: ; HSA: workgroup_group_segment_byte_size = 86 ; HSA: group_segment_alignment = 4 -define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, 
i32 38, i32 8, i1 false) @@ -65,7 +65,7 @@ define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* % ; HSA-LABEL: {{^}}test_round_local_lds_and_arg: ; HSA: workgroup_group_segment_byte_size = 38 ; HSA: group_segment_alignment = 4 -define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { +define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) @@ -78,7 +78,7 @@ define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1) ; HSA-LABEL: {{^}}test_round_lds_arg: ; HSA: workgroup_group_segment_byte_size = 0 ; HSA: group_segment_alignment = 4 -define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { +define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 { call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false) ret void @@ -88,7 +88,7 @@ define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 ; HSA-LABEL: {{^}}test_high_align_lds_arg: ; HSA: workgroup_group_segment_byte_size = 0 ; HSA: group_segment_alignment = 4 -define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { +define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 { call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false) ret void @@ -98,7 +98,7 @@ define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0: ; HSA: workgroup_group_segment_byte_size = 212 ; HSA: group_segment_alignment = 4 -define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false) @@ -114,7 +114,7 @@ define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addr ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1: ; HSA: workgroup_group_segment_byte_size = 216 ; HSA: group_segment_alignment = 4 -define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, 
i8 addrspace(1)* %in, i32 56, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false) @@ -142,7 +142,7 @@ define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addr ; HSA-LABEL: {{^}}test_round_size_3_order0: ; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) @@ -163,7 +163,7 @@ define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order1: ; HSA: workgroup_group_segment_byte_size = 134 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false) @@ -184,7 +184,7 @@ define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order2: ; HSA: workgroup_group_segment_byte_size = 150 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -205,7 +205,7 @@ define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order3: ; HSA: workgroup_group_segment_byte_size = 118 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false) @@ -226,7 +226,7 @@ define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order4: ; HSA: workgroup_group_segment_byte_size = 142 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, 
i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) @@ -247,7 +247,7 @@ define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %i ; HSA-LABEL: {{^}}test_round_size_3_order5: ; HSA: workgroup_group_segment_byte_size = 126 ; HSA: group_segment_alignment = 4 -define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false) call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll index 9875814b03d..254673d8a1e 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-initializer.ll @@ -5,7 +5,7 @@ @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8] -define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { +define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) { %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10 %ld = load i32, i32 addrspace(3)* %gep store i32 %ld, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll index 078d6330ce0..1b3eeed3005 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll @@ -18,7 +18,7 @@ ; GCN: BB0_3: ; GCN-NEXT: s_endpgm -define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 { +define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge diff --git a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll index 6ff6fc3d7af..fff2a920072 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-oqap-crash.ll @@ -10,7 +10,7 @@ ; reads and writes are bundled together in the same instruction. 
; CHECK: {{^}}lds_crash: -define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = load i32, i32 addrspace(3)* %in ; This block needs to be > 115 ISA instructions to hit the bug, diff --git a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll index abe472e423f..8b7e9e6d6aa 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-output-queue.ll @@ -10,7 +10,7 @@ @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4 -define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index %1 = load i32, i32 addrspace(3)* %0 @@ -88,7 +88,7 @@ declare void @llvm.r600.group.barrier() nounwind convergent ; CHECK: LDS_READ_RET ; CHECK-NOT: ALU clause ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP -define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0 %1 = load i32, i32 addrspace(3)* %0 diff --git a/llvm/test/CodeGen/AMDGPU/lds-size.ll b/llvm/test/CodeGen/AMDGPU/lds-size.ll index 1607713090e..c65817abd48 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-size.ll @@ -14,7 +14,7 @@ ; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only) @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4 -define void @test(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll index cb5d73fb0d8..53c1c727a19 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-zero-initializer.ll @@ -5,7 +5,7 @@ @lds = addrspace(3) global [256 x i32] zeroinitializer -define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { +define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) { %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10 %ld = load i32, i32 addrspace(3)* %gep store i32 %ld, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll index 4244c48d240..e85a1b690af 100644 --- a/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll @@ -11,7 +11,7 @@ ; CHECK: {{^}}setcc_expand: ; CHECK: SET ; CHECK-NOT: CND -define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp eq i32 %in, 5 br i1 %0, label %IF, label %ENDIF diff --git a/llvm/test/CodeGen/AMDGPU/literals.ll b/llvm/test/CodeGen/AMDGPU/literals.ll index 82fbb7f4618..1c546ba9f74 100644 --- a/llvm/test/CodeGen/AMDGPU/literals.ll +++ b/llvm/test/CodeGen/AMDGPU/literals.ll @@ -10,7 +10,7 @@ ; CHECK: LSHR ; CHECK-NEXT: ADD_INT * {{\** 
*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 5 -define void @i32_literal(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) { entry: %0 = add i32 5, %in store i32 %0, i32 addrspace(1)* %out @@ -27,7 +27,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y ; CHECK-NEXT: 1084227584(5.0 -define void @float_literal(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) { entry: %0 = fadd float 5.0, %in store float %0, float addrspace(1)* %out @@ -41,7 +41,7 @@ entry: ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0 -define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) { entry: store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out ret void @@ -52,7 +52,7 @@ entry: ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0 -define void @inline_literal_dot4(float addrspace(1)* %out) { +define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) { entry: %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>) store float %0, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/liveness.mir b/llvm/test/CodeGen/AMDGPU/liveness.mir index 112c3f8e69a..48762e3f2ab 100644 --- a/llvm/test/CodeGen/AMDGPU/liveness.mir +++ b/llvm/test/CodeGen/AMDGPU/liveness.mir @@ -8,7 +8,7 @@ # Should see three distinct value numbers: # CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}} --- | - define void @test0() { ret void } + define amdgpu_kernel void @test0() { ret void } ... 
--- name: test0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll index 77dd4b13498..64f0d65463b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll @@ -8,7 +8,7 @@ declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone ; SI: v_bfe_i32 ; EG: BFE_INT ; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -17,7 +17,7 @@ define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i ; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm: ; SI: v_bfe_i32 ; EG: BFE_INT -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) n ; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg: ; SI: v_bfe_i32 ; EG: BFE_INT -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) n ; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg: ; SI: v_bfe_i32 ; EG: BFE_INT -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -43,7 +43,7 @@ define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n ; FUNC-LABEL: {{^}}v_bfe_print_arg: ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { +define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind { %load = load i32, i32 addrspace(1)* %src0, align 4 %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -54,7 +54,7 @@ define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) no ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -64,7 +64,7 @@ define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 
%src0, i ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -74,7 +74,7 @@ define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31) @@ -88,7 +88,7 @@ define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31) @@ -100,7 +100,7 @@ define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: buffer_load_dword ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; SI: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) @@ -113,7 +113,7 @@ define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -125,7 +125,7 @@ define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -137,7 +137,7 @@ define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 
addrspace(1)* %out, align 4 @@ -149,7 +149,7 @@ define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -160,7 +160,7 @@ define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) @@ -171,7 +171,7 @@ define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI-NOT: lshr ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1) @@ -184,7 +184,7 @@ define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -196,7 +196,7 @@ define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -208,7 +208,7 @@ define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -220,7 +220,7 @@ define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -232,7 +232,7 @@ define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { ; SI: 
buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -244,7 +244,7 @@ define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -256,7 +256,7 @@ define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -268,7 +268,7 @@ define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -280,7 +280,7 @@ define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -292,7 +292,7 @@ define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -304,7 +304,7 @@ define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -316,7 +316,7 @@ define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) 
nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -328,7 +328,7 @@ define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -340,7 +340,7 @@ define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -352,7 +352,7 @@ define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -364,7 +364,7 @@ define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -376,7 +376,7 @@ define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -388,7 +388,7 @@ define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -400,7 +400,7 @@ define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -412,7 +412,7 @@ define void 
@bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind { ; SI-NOT: v_ashr ; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 ; SI: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24) %shl = shl i32 %bfe, 8 @@ -428,7 +428,7 @@ define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] ; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; SI: buffer_store_dword [[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %src = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone %div = sdiv i32 %bfe, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll index ee47b14c496..8cf1e16c96d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll @@ -8,7 +8,7 @@ declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone ; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -17,7 +17,7 @@ define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i ; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) n ; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) n ; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg: ; SI: v_bfe_u32 ; EG: BFE_UINT -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { +define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) n ; SI-NOT: 
{{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -55,7 +55,7 @@ define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { +define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -65,7 +65,7 @@ define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i ; SI: buffer_load_ubyte ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8) @@ -82,7 +82,7 @@ define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) n ; VI-NEXT: v_and_b32_e32 ; SI-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -97,7 +97,7 @@ define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %i ; SI-NEXT: v_and_b32_e32 ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -111,7 +111,7 @@ define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* % ; SI: v_add_i32 ; SI: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -126,7 +126,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspa ; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 ; SI-NEXT: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -141,7 +141,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspa ; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 ; SI-NEXT: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* 
%in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -155,7 +155,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspa ; SI: v_add_i32 ; SI-NEXT: bfe ; SI: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -169,14 +169,14 @@ define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrsp ; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; SI: s_endpgm ; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1, -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8) @@ -184,7 +184,7 @@ define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ret void } -define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1) @@ -199,7 +199,7 @@ define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = lshr i32 %shl, 31 @@ -214,7 +214,7 @@ define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI-NOT: shr ; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 ; SI: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -227,7 +227,7 @@ define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31) @@ -239,7 +239,7 @@ define void @bfe_u32_test_6(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31) @@ -252,7 +252,7 @@ define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) @@ -265,7 +265,7 @@ define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -277,7 +277,7 @@ define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -289,7 +289,7 @@ define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -301,7 +301,7 @@ define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -312,7 +312,7 @@ define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call 
i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) @@ -323,7 +323,7 @@ define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI-NOT: lshr ; SI-NOT: {{[^@]}}bfe ; SI: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1) @@ -336,7 +336,7 @@ define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) noun ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -348,7 +348,7 @@ define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -360,7 +360,7 @@ define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -372,7 +372,7 @@ define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -384,7 +384,7 @@ define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -396,7 +396,7 @@ define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -408,7 +408,7 @@ define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind { ; SI: buffer_store_dword [[VREG]], ; SI: s_endpgm ; EG-NOT: BFE -define void 
@bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -420,7 +420,7 @@ define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -432,7 +432,7 @@ define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -444,7 +444,7 @@ define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -456,7 +456,7 @@ define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -468,7 +468,7 @@ define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -480,7 +480,7 @@ define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -492,7 +492,7 @@ define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -504,7 +504,7 @@ define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -516,7 +516,7 @@ define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -528,7 +528,7 @@ define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -540,7 +540,7 @@ define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -552,7 +552,7 @@ define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
 ; SI: buffer_store_dword [[VREG]],
 ; SI: s_endpgm
 ; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
 %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
 store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
 ret void
@@ -569,7 +569,7 @@ define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
 ; SI-DAG: buffer_store_dword [[AND]]
 ; SI-DAG: buffer_store_dword [[BFE]]
 ; SI: s_endpgm
-define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
 i32 addrspace(1)* %out1,
 i32 addrspace(1)* %in) nounwind {
 %src = load i32, i32 addrspace(1)* %in, align 4
@@ -583,7 +583,7 @@ define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
 ; FUNC-LABEL: {{^}}lshr_and:
 ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
 ; SI: buffer_store_dword
-define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
 %b = lshr i32 %a, 6
 %c = and i32 %b, 7
 store i32 %c, i32 addrspace(1)* %out, align 8
@@ -593,7 +593,7 @@ define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
 ; FUNC-LABEL: {{^}}v_lshr_and:
 ; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
 ; SI: buffer_store_dword
-define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %c = lshr i32 %a, %b %d = and i32 %c, 7 store i32 %d, i32 addrspace(1)* %out, align 8 @@ -603,7 +603,7 @@ define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; FUNC-LABEL: {{^}}and_lshr: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; SI: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { %b = and i32 %a, 448 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -613,7 +613,7 @@ define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { ; FUNC-LABEL: {{^}}and_lshr2: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; SI: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { %b = and i32 %a, 511 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -623,7 +623,7 @@ define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind { ; FUNC-LABEL: {{^}}shl_lshr: ; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 ; SI: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind { %b = shl i32 %a, 9 %c = lshr i32 %b, 11 store i32 %c, i32 addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll index 23a32dcfd94..2777dd12a1f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.export.ll @@ -6,7 +6,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float ; GCN-LABEL: {{^}}test_export_zeroes: ; GCN: exp mrt0 off, off, off, off{{$}} ; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes() #0 { +define amdgpu_kernel void @test_export_zeroes() #0 { call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0) call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0) @@ -21,7 +21,7 @@ define void @test_export_zeroes() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0() #0 { +define amdgpu_kernel void @test_export_en_src0() #0 { call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -32,7 +32,7 @@ define void @test_export_en_src0() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1() #0 { +define amdgpu_kernel void @test_export_en_src1() #0 { call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -43,7 +43,7 @@ define void @test_export_en_src1() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2() #0 { +define amdgpu_kernel void @test_export_en_src2() #0 { call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -54,7 +54,7 @@ 
define void @test_export_en_src2() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} -define void @test_export_en_src3() #0 { +define amdgpu_kernel void @test_export_en_src3() #0 { call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -65,7 +65,7 @@ define void @test_export_en_src3() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1() #0 { +define amdgpu_kernel void @test_export_en_src0_src1() #0 { call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -76,7 +76,7 @@ define void @test_export_en_src0_src1() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2() #0 { +define amdgpu_kernel void @test_export_en_src0_src2() #0 { call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void } @@ -88,7 +88,7 @@ define void @test_export_en_src0_src2() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3() #0 { +define amdgpu_kernel void @test_export_en_src0_src3() #0 { call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -101,7 +101,7 @@ define void @test_export_en_src0_src3() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -111,7 +111,7 @@ define void @test_export_en_src0_src1_src2_src3() #0 { ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7() #0 { +define amdgpu_kernel void @test_export_mrt7() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5) ret void @@ -124,7 +124,7 @@ define void @test_export_mrt7() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z() #0 { +define amdgpu_kernel void @test_export_z() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -137,7 +137,7 @@ define void @test_export_z() #0 { ; GCN-DAG: 
v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null() #0 { +define amdgpu_kernel void @test_export_null() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -150,7 +150,7 @@ define void @test_export_null() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10() #0 { +define amdgpu_kernel void @test_export_reserved10() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -163,7 +163,7 @@ define void @test_export_reserved10() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11() #0 { +define amdgpu_kernel void @test_export_reserved11() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -176,7 +176,7 @@ define void @test_export_reserved11() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0() #0 { +define amdgpu_kernel void @test_export_pos0() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -189,7 +189,7 @@ define void @test_export_pos0() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3() #0 { +define amdgpu_kernel void @test_export_pos3() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -202,7 +202,7 @@ define void @test_export_pos3() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0() #0 { +define amdgpu_kernel void @test_export_param0() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -215,7 +215,7 @@ define void @test_export_param0() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31() #0 
{ +define amdgpu_kernel void @test_export_param31() #0 { call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void @@ -228,7 +228,7 @@ define void @test_export_param31() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm() #0 { +define amdgpu_kernel void @test_export_vm() #0 { call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll index 9c845e84bc1..4a620e0d191 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -23,7 +23,7 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_dec_u32 [[VPTR]], [[DATA]] -define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) ret void } @@ -43,7 +43,7 @@ define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42) ret void @@ -52,7 +52,7 @@ define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} 
-define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -61,7 +61,7 @@ define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -70,7 +70,7 @@ define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrsp ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32: ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) ret void } @@ -78,7 +78,7 @@ define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind { ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42) ret void @@ -88,7 +88,7 @@ define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} ; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id @@ -102,7 +102,7 @@ define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 @@ -113,7 +113,7 @@ define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) # ; 
GCN-LABEL: {{^}}flat_atomic_dec_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) store i32 %result, i32 addrspace(4)* %out ret void @@ -122,7 +122,7 @@ define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* % ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) store i32 %result, i32 addrspace(4)* %out @@ -131,7 +131,7 @@ define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspac ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32: ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) ret void } @@ -139,7 +139,7 @@ define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind { ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42) ret void @@ -148,7 +148,7 @@ define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id @@ -161,7 +161,7 @@ define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 @@ -173,7 +173,7 @@ define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 ; 
GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) store i64 %result, i64 addrspace(4)* %out ret void @@ -183,7 +183,7 @@ define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* % ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) store i64 %result, i64 addrspace(4)* %out @@ -194,7 +194,7 @@ define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) ret void } @@ -203,7 +203,7 @@ define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42) ret void @@ -213,7 +213,7 @@ define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id @@ -227,7 +227,7 @@ define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 a ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 
@llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 @@ -240,7 +240,7 @@ define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 ; SI-LABEL: {{^}}atomic_dec_shl_base_lds_0: ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8 -define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -254,7 +254,7 @@ define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -264,7 +264,7 @@ define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 -define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -275,7 +275,7 @@ define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) ret void } @@ -284,7 +284,7 @@ define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42) ret void @@ -294,7 +294,7 @@ define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_dec_ret_i64(i64 addrspace(1)* 
%out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -304,7 +304,7 @@ define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -315,7 +315,7 @@ define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) ret void } @@ -324,7 +324,7 @@ define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42) ret void @@ -335,7 +335,7 @@ define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id @@ -350,7 +350,7 @@ define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = 
getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 @@ -363,7 +363,7 @@ define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) # ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 -define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll index 22097418eec..a4e3dc337b1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -14,7 +14,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] -define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -23,7 +23,7 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_inc_u32 [[VPTR]], [[DATA]] -define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42) ret void } @@ -43,7 +43,7 @@ define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16 -define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42) ret void @@ -52,7 +52,7 @@ define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void 
@global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) store i32 %result, i32 addrspace(1)* %out ret void @@ -61,7 +61,7 @@ define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}} -define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) store i32 %result, i32 addrspace(1)* %out @@ -70,7 +70,7 @@ define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrsp ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32: ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42) ret void } @@ -78,7 +78,7 @@ define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind { ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}} -define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42) ret void @@ -88,7 +88,7 @@ define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}} ; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id @@ -102,7 +102,7 @@ define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}} ; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5 @@ -115,7 +115,7 @@ define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) # ; GCN-LABEL: 
{{^}}atomic_inc_shl_base_lds_0_i32: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 -define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -129,7 +129,7 @@ define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32 -define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { +define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -150,7 +150,7 @@ define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42) ret void } @@ -159,7 +159,7 @@ define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}} -define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42) ret void @@ -169,7 +169,7 @@ define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}} -define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) store i64 
%result, i64 addrspace(1)* %out ret void @@ -179,7 +179,7 @@ define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}} -define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) store i64 %result, i64 addrspace(1)* %out @@ -190,7 +190,7 @@ define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42) ret void } @@ -199,7 +199,7 @@ define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}} -define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42) ret void @@ -210,7 +210,7 @@ define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id @@ -225,7 +225,7 @@ define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} -define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5 @@ -236,7 +236,7 @@ define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) # ; GCN-LABEL: 
{{^}}flat_atomic_inc_ret_i32: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) store i32 %result, i32 addrspace(4)* %out ret void @@ -245,7 +245,7 @@ define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* % ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) store i32 %result, i32 addrspace(4)* %out @@ -254,7 +254,7 @@ define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspac ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32: ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42) ret void } @@ -262,7 +262,7 @@ define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind { ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32_offset: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4 %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42) ret void @@ -271,7 +271,7 @@ define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind { ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}} -define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id @@ -284,7 +284,7 @@ define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}} -define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5 @@ -297,7 +297,7 @@ define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 ; GCN-LABEL: 
{{^}}atomic_inc_shl_base_lds_0_i64: ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}} ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16 -define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0 @@ -320,7 +320,7 @@ attributes #2 = { nounwind argmemonly } ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) store i64 %result, i64 addrspace(4)* %out ret void @@ -330,7 +330,7 @@ define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* % ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) store i64 %result, i64 addrspace(4)* %out @@ -341,7 +341,7 @@ define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42) ret void } @@ -350,7 +350,7 @@ define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4 %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42) ret void @@ -360,7 +360,7 @@ define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} -define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 { %id = call 
i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id @@ -374,7 +374,7 @@ define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 a ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}} -define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { +define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll index 6d9db65e7d9..10bea8ea63b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll @@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1() #0 ; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00] ; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_buffer_wbinvl1() #0 { +define amdgpu_kernel void @test_buffer_wbinvl1() #0 { call void @llvm.amdgcn.buffer.wbinvl1() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll index 746298465e5..fe60d16d90f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll @@ -6,7 +6,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0 ; SI-NEXT: ; BB#0: ; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] ; SI-NEXT: s_endpgm -define void @test_buffer_wbinvl1_sc() #0 { +define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 { call void @llvm.amdgcn.buffer.wbinvl1.sc() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll index 4e0f3c37f21..061c1469ed4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll @@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0 ; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00] ; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00] ; GCN: s_endpgm -define void @test_buffer_wbinvl1_vol() #0 { +define amdgpu_kernel void @test_buffer_wbinvl1_vol() #0 { call void @llvm.amdgcn.buffer.wbinvl1.vol() ; This used to crash in hazard recognizer store i8 0, i8 addrspace(1)* undef, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll index 3798c46677f..f08d4b6c791 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll @@ -10,7 +10,7 @@ declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b) ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]] ; GCN: buffer_store_dword v[[R_I32]] ; GCN: s_endpgm -define void @class_f16( +define amdgpu_kernel void @class_f16( i32 addrspace(1)* %r, half addrspace(1)* %a, i32 addrspace(1)* %b) { @@ -31,7 +31,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword 
v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_fabs( +define amdgpu_kernel void @class_f16_fabs( i32 addrspace(1)* %r, half %a.val, i32 %b.val) { @@ -51,7 +51,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_fneg( +define amdgpu_kernel void @class_f16_fneg( i32 addrspace(1)* %r, half %a.val, i32 %b.val) { @@ -71,7 +71,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_fabs_fneg( +define amdgpu_kernel void @class_f16_fabs_fneg( i32 addrspace(1)* %r, half %a.val, i32 %b.val) { @@ -91,7 +91,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_1( +define amdgpu_kernel void @class_f16_1( i32 addrspace(1)* %r, half %a.val) { entry: @@ -108,7 +108,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]] ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_64( +define amdgpu_kernel void @class_f16_64( i32 addrspace(1)* %r, half %a.val) { entry: @@ -126,7 +126,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_full_mask( +define amdgpu_kernel void @class_f16_full_mask( i32 addrspace(1)* %r, half %a.val) { entry: @@ -144,7 +144,7 @@ entry: ; VI: v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc ; GCN: buffer_store_dword v[[VR_I32]] ; GCN: s_endpgm -define void @class_f16_nine_bit_mask( +define amdgpu_kernel void @class_f16_nine_bit_mask( i32 addrspace(1)* %r, half %a.val) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 668c669e41e..1fcdac537fb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -14,7 +14,7 @@ declare double @llvm.fabs.f64(double) #1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -29,7 +29,7 @@ define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -45,7 +45,7 @@ define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -61,7 +61,7 @@ define void 
@test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -76,7 +76,7 @@ define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -104,7 +104,7 @@ define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -118,7 +118,7 @@ define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -132,7 +132,7 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -150,7 +150,7 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace( ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -170,7 +170,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* % ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -190,7 +190,7 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -205,7 +205,7 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -221,7 +221,7 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -237,7 +237,7 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]] ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 { %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -249,7 +249,7 @@ define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) ; SI-LABEL: {{^}}test_class_1_f64: ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} ; SI: s_endpgm -define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 
addrspace(1)* %out, align 4 @@ -259,7 +259,7 @@ define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-LABEL: {{^}}test_class_64_f64: ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} ; SI: s_endpgm -define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -275,7 +275,7 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -290,7 +290,7 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -306,7 +306,7 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace ; XSI: v_cmp_class_f64_e32 vcc, 1.0, ; SI: v_cmp_class_f64_e32 vcc, ; SI: s_endpgm -define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -321,7 +321,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* % ; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64: ; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: s_endpgm -define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -338,7 +338,7 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -358,7 +358,7 @@ define void @test_fold_or_class_f32_0(i32 
addrspace(1)* %out, float addrspace(1) ; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -381,7 +381,7 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1 ; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -416,7 +416,7 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -436,7 +436,7 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1) ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}} ; SI-NOT: v_cmp_class ; SI: s_endpgm -define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -456,7 +456,7 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1) ; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}} ; SI: s_or_b64 ; SI: s_endpgm -define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { +define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -476,7 +476,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -488,7 +488,7 @@ define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 { ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_dword 
[[RESULT]] ; SI: s_endpgm -define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { +define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -500,7 +500,7 @@ define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 { ; SI-NOT: v_cmp_class ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, ; SI: buffer_store_dword -define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { +define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll index 410ac59279a..05438860729 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.cos.f16(half %a) ; VI: v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @cos_f16( +define amdgpu_kernel void @cos_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll index f6495d8155f..5b9c83c11cf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cos.f32(float) #0 ; GCN-LABEL: {{^}}v_cos_f32: ; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 { %cos = call float @llvm.amdgcn.cos.f32(float %src) #0 store float %cos, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll index 22bed45ee30..dadb070bdcf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubeid(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubeid: ; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll index 565f22c5d5b..60c4618a011 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubema(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubema: ; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll index a3ba3274581..10669cf9913 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubesc: ; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll index d3c0f2851ea..b2770308c17 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubetc(float, float, float) #0 ; GCN-LABEL: {{^}}test_cubetc: ; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { +define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 { %result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c) store float %result, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index 24a897521f7..b92eb34750d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -8,7 +8,7 @@ ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]] -define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { +define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -17,7 +17,7 @@ define void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, floa ; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32: ; GCN: s_load_dword [[X:s[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]] -define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -29,7 +29,7 @@ define void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float ; SI-NEXT: s_endpgm ; VI-NEXT: s_endpgm ; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} -define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { +define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef) store <2 x half> %result, <2 x half> addrspace(1)* %out ret void @@ -40,7 +40,7 @@ define void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 { ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] -define void @v_cvt_pkrtz_v2f16_f32(<2 x half> 
addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -56,7 +56,7 @@ define void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspac ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 -define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -71,7 +71,7 @@ define void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] ; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] -define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -86,7 +86,7 @@ define void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -104,7 +104,7 @@ define void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -122,7 +122,7 @@ define void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float 
addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext @@ -141,7 +141,7 @@ define void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, flo ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] -define void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll index 6c09aa59244..58250de2f89 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll @@ -9,7 +9,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @dispatch_id(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 { %tmp0 = call i64 @llvm.amdgcn.dispatch.id() store i64 %tmp0, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll index 2e8625256f1..92208e7fe17 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_dispatch_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)* %value = load i32, i32 addrspace(2)* %header_ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll index 6d262cf497a..e04d9e662ce 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll @@ -9,7 +9,7 @@ declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c) ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16( +define amdgpu_kernel void @div_fixup_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -30,7 +30,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a( +define amdgpu_kernel void @div_fixup_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -49,7 +49,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_b( +define amdgpu_kernel void @div_fixup_f16_imm_b( half addrspace(1)* 
%r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -68,7 +68,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -86,7 +86,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a_imm_b( +define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b( half addrspace(1)* %r, half addrspace(1)* %c) { entry: @@ -102,7 +102,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_b_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -118,7 +118,7 @@ entry: ; VI: v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @div_fixup_f16_imm_a_imm_c( +define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c( half addrspace(1)* %r, half addrspace(1)* %b) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll index cc1504f2bc8..b8fcacf46bb 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll @@ -16,7 +16,7 @@ declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readn ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -24,7 +24,7 @@ define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, fl ; GCN-LABEL: {{^}}test_div_fixup_f64: ; GCN: v_div_fixup_f64 -define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { +define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind { %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index d408fe9f87f..a86468b07a2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -20,7 +20,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 
ret void @@ -34,7 +34,7 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -48,7 +48,7 @@ define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -62,7 +62,7 @@ define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -70,7 +70,7 @@ define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, ; GCN-LABEL: {{^}}test_div_fmas_f64: ; GCN: v_div_fmas_f64 -define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind { %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void @@ -79,7 +79,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc: ; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}} ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind { %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone store float %result, float addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, f ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc: ; SI: s_mov_b64 vcc, 0 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define 
amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -98,7 +98,7 @@ define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, f ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc: ; SI: s_mov_b64 vcc, -1 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} -define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind { %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -114,7 +114,7 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]] ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] ; SI: s_endpgm -define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1 @@ -150,7 +150,7 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword ; SI: s_endpgm -define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { +define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.out = getelementptr float, float addrspace(1)* %out, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll index 8e5c62c31db..0b4f09ac651 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll @@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -31,7 +31,7 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, 
float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -51,7 +51,7 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -71,7 +71,7 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1) ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1 @@ -91,7 +91,7 @@ define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1) ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -109,7 +109,7 @@ define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float add ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -127,7 +127,7 @@ define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float add ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -145,7 +145,7 
@@ define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float add ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -163,7 +163,7 @@ define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float add ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -181,7 +181,7 @@ define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double a ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -199,7 +199,7 @@ define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double a ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -217,7 +217,7 @@ define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double a ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr double, double addrspace(1)* %in, i32 %tid @@ -236,7 +236,7 @@ define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double a ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { +define 
amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -250,7 +250,7 @@ define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind { %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone %result0 = extractvalue { float, i1 } %result, 0 store float %result0, float addrspace(1)* %out, align 4 @@ -265,7 +265,7 @@ define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}} ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -280,7 +280,7 @@ define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double % ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]] ; SI: buffer_store_dwordx2 [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { +define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind { %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone %result0 = extractvalue { double, i1 } %result, 0 store double %result0, double addrspace(1)* %out, align 8 @@ -292,7 +292,7 @@ define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double % ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0 ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -308,7 +308,7 @@ define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float a ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, 
float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %a = load float, float addrspace(1)* %gep.0, align 4 @@ -326,7 +326,7 @@ define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float a ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -349,7 +349,7 @@ define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspa ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]] ; SI: buffer_store_dword [[RESULT0]] ; SI: s_endpgm -define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll index 92d3fc8b107..4d4e8d45ef5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_bpermute: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void @@ -14,7 +14,7 @@ define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind ; CHECK-LABEL: {{^}}ds_bpermute_imm_offset: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4 ; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %index = add i32 %base_index, 4 %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 @@ -24,7 +24,7 @@ define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 ; CHECK-LABEL: {{^}}ds_bpermute_imm_index: ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64 ; CHECK: s_waitcnt lgkmcnt -define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll index 6d9c9419153..1766e288331 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0 ; CHECK-LABEL: {{^}}ds_permute: ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CHECK: s_waitcnt lgkmcnt -define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { +define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 ret void @@ -14,7 +14,7 @@ define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind { ; CHECK-LABEL: {{^}}ds_permute_imm_offset: ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4 ; CHECK: s_waitcnt lgkmcnt -define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { +define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind { %index = add i32 %base_index, 4 %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0 store i32 %bpermute, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll index ef3cb00024b..a3a78d326a6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0 ; FUNC-LABEL: {{^}}ds_swizzle: ; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100 ; CHECK: s_waitcnt lgkmcnt -define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { +define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind { %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0 store i32 %swizzle, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll index a8f6579acbf..b972ddb8cb7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll @@ -8,7 +8,7 @@ declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1 ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2f16: ; GCN: exp mrt0 off, off, off, off compr{{$}} ; GCN: exp mrt0 off, off, off, off done compr{{$}} -define void @test_export_compr_zeroes_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_zeroes_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 true, i1 false) ret void @@ -18,7 +18,7 @@ define void @test_export_compr_zeroes_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], off, off done compr{{$}} -define void @test_export_compr_en_src0_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -27,7 +27,7 @@ define void 
@test_export_compr_en_src0_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src1_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_src1_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -36,7 +36,7 @@ define void @test_export_compr_en_src1_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src0_src1_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_src1_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -45,7 +45,7 @@ define void @test_export_compr_en_src0_src1_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 off, [[SRC0]], off, off done compr{{$}} -define void @test_export_compr_en_invalid2_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_invalid2_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -54,7 +54,7 @@ define void @test_export_compr_en_invalid2_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 off, [[SRC0]], off, [[SRC1]] done compr{{$}} -define void @test_export_compr_en_invalid10_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_en_invalid10_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 10, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void } @@ -63,7 +63,7 @@ define void @test_export_compr_en_invalid10_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0x38003800 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] compr{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done compr{{$}} -define void @test_export_compr_mrt7_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_mrt7_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 true, i1 false) ret void @@ -74,7 +74,7 @@ define void @test_export_compr_mrt7_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_z_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_z_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false) ret void @@ -85,7 +85,7 @@ define void @test_export_compr_z_v2f16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] 
compr vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}} -define void @test_export_compr_vm_v2f16() #0 { +define amdgpu_kernel void @test_export_compr_vm_v2f16() #0 { call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true) call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true) ret void @@ -94,7 +94,7 @@ define void @test_export_compr_vm_v2f16() #0 { ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2i16: ; GCN: exp mrt0 off, off, off, off compr{{$}} ; GCN: exp mrt0 off, off, off, off done compr{{$}} -define void @test_export_compr_zeroes_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_zeroes_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 true, i1 false) ret void @@ -104,7 +104,7 @@ define void @test_export_compr_zeroes_v2i16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}} -define void @test_export_compr_en_src0_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 1, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void } @@ -113,7 +113,7 @@ define void @test_export_compr_en_src0_v2i16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src1_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_en_src1_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 12, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void } @@ -122,7 +122,7 @@ define void @test_export_compr_en_src1_v2i16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_en_src0_src1_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_en_src0_src1_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void } @@ -131,7 +131,7 @@ define void @test_export_compr_en_src0_src1_v2i16() #0 { ; GCN-DAG: v_mov_b32_e32 [[VI16:v[0-9]+]], 0x50005 ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] compr{{$}} ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] done compr{{$}} -define void @test_export_compr_mrt7_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_mrt7_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 true, i1 false) ret void @@ -142,7 +142,7 @@ define void @test_export_compr_mrt7_v2i16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}} -define void @test_export_compr_z_v2i16() #0 { +define amdgpu_kernel void 
@test_export_compr_z_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false) call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false) ret void @@ -153,7 +153,7 @@ define void @test_export_compr_z_v2i16() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005 ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}} -define void @test_export_compr_vm_v2i16() #0 { +define amdgpu_kernel void @test_export_compr_vm_v2i16() #0 { call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true) call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll index 9b5836fa56a..6d2de108829 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -7,7 +7,7 @@ declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1 ; GCN-LABEL: {{^}}test_export_zeroes_f32: ; GCN: exp mrt0 off, off, off, off{{$}} ; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes_f32() #0 { +define amdgpu_kernel void @test_export_zeroes_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false) @@ -22,7 +22,7 @@ define void @test_export_zeroes_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -33,7 +33,7 @@ define void @test_export_en_src0_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1_f32() #0 { +define amdgpu_kernel void @test_export_en_src1_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -44,7 +44,7 @@ define void @test_export_en_src1_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2_f32() #0 { +define amdgpu_kernel void @test_export_en_src2_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -55,7 +55,7 @@ define void @test_export_en_src2_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} -define void @test_export_en_src3_f32() #0 { +define amdgpu_kernel void @test_export_en_src3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -66,7 +66,7 @@ define void @test_export_en_src3_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 
[[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -77,7 +77,7 @@ define void @test_export_en_src0_src1_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void } @@ -89,7 +89,7 @@ define void @test_export_en_src0_src2_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -102,7 +102,7 @@ define void @test_export_en_src0_src3_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3_f32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -112,7 +112,7 @@ define void @test_export_en_src0_src1_src2_src3_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7_f32() #0 { +define amdgpu_kernel void @test_export_mrt7_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false) ret void @@ -125,7 +125,7 @@ define void @test_export_mrt7_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z_f32() #0 { +define amdgpu_kernel void @test_export_z_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -138,7 +138,7 @@ define void @test_export_z_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null_f32() #0 { +define amdgpu_kernel void @test_export_null_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, 
float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -151,7 +151,7 @@ define void @test_export_null_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10_f32() #0 { +define amdgpu_kernel void @test_export_reserved10_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -164,7 +164,7 @@ define void @test_export_reserved10_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11_f32() #0 { +define amdgpu_kernel void @test_export_reserved11_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -177,7 +177,7 @@ define void @test_export_reserved11_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0_f32() #0 { +define amdgpu_kernel void @test_export_pos0_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -190,7 +190,7 @@ define void @test_export_pos0_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3_f32() #0 { +define amdgpu_kernel void @test_export_pos3_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -203,7 +203,7 @@ define void @test_export_pos3_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0_f32() #0 { +define amdgpu_kernel void @test_export_param0_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -216,7 +216,7 @@ define void @test_export_param0_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31_f32() #0 { +define amdgpu_kernel void @test_export_param31_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false) call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false) ret void @@ -229,7 +229,7 
@@ define void @test_export_param31_f32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm_f32() #0 { +define amdgpu_kernel void @test_export_vm_f32() #0 { call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true) call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true) ret void @@ -252,7 +252,7 @@ define void @test_export_vm_f32() #0 { ; GCN-LABEL: {{^}}test_export_zeroes_i32: ; GCN: exp mrt0 off, off, off, off{{$}} ; GCN: exp mrt0 off, off, off, off done{{$}} -define void @test_export_zeroes_i32() #0 { +define amdgpu_kernel void @test_export_zeroes_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 true, i1 false) @@ -267,7 +267,7 @@ define void @test_export_zeroes_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}} -define void @test_export_en_src0_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -278,7 +278,7 @@ define void @test_export_en_src0_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}} -define void @test_export_en_src1_i32() #0 { +define amdgpu_kernel void @test_export_en_src1_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -289,7 +289,7 @@ define void @test_export_en_src1_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}} -define void @test_export_en_src2_i32() #0 { +define amdgpu_kernel void @test_export_en_src2_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -300,7 +300,7 @@ define void @test_export_en_src2_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}} -define void @test_export_en_src3_i32() #0 { +define amdgpu_kernel void @test_export_en_src3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -311,7 +311,7 @@ define void @test_export_en_src3_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}} -define void @test_export_en_src0_src1_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -322,7 +322,7 @@ define void @test_export_en_src0_src1_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}} -define void @test_export_en_src0_src2_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void } @@ -334,7 +334,7 @@ define void 
@test_export_en_src0_src2_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}} -define void @test_export_en_src0_src3_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -347,7 +347,7 @@ define void @test_export_en_src0_src3_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_en_src0_src1_src2_src3_i32() #0 { +define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -357,7 +357,7 @@ define void @test_export_en_src0_src1_src2_src3_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}} ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}} -define void @test_export_mrt7_i32() #0 { +define amdgpu_kernel void @test_export_mrt7_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false) ret void @@ -370,7 +370,7 @@ define void @test_export_mrt7_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_z_i32() #0 { +define amdgpu_kernel void @test_export_z_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -383,7 +383,7 @@ define void @test_export_z_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_null_i32() #0 { +define amdgpu_kernel void @test_export_null_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -396,7 +396,7 @@ define void @test_export_null_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved10_i32() #0 { +define amdgpu_kernel void @test_export_reserved10_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -409,7 +409,7 @@ define void @test_export_reserved10_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_reserved11_i32() #0 { +define amdgpu_kernel 
void @test_export_reserved11_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -422,7 +422,7 @@ define void @test_export_reserved11_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos0_i32() #0 { +define amdgpu_kernel void @test_export_pos0_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -435,7 +435,7 @@ define void @test_export_pos0_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_pos3_i32() #0 { +define amdgpu_kernel void @test_export_pos3_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -448,7 +448,7 @@ define void @test_export_pos3_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param0_i32() #0 { +define amdgpu_kernel void @test_export_param0_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -461,7 +461,7 @@ define void @test_export_param0_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}} ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}} -define void @test_export_param31_i32() #0 { +define amdgpu_kernel void @test_export_param31_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false) call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false) ret void @@ -474,7 +474,7 @@ define void @test_export_param31_i32() #0 { ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}} ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}} -define void @test_export_vm_i32() #0 { +define amdgpu_kernel void @test_export_vm_i32() #0 { call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true) call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll index 42a252d35d2..c9993ee8836 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll @@ -7,7 +7,7 @@ declare float @llvm.fabs.f32(float) #0 ; GCN-LABEL: {{^}}v_fcmp_f32_dynamic_cc: ; GCN: s_endpgm -define void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) { +define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src0, float %src1, i32 %cc) store i64 
%result, i64 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %s ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs: ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}} -define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { +define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1) store i64 %result, i64 addrspace(1)* %out @@ -24,7 +24,7 @@ define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs: ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}| -define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { +define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) { %temp = call float @llvm.fabs.f32(float %a) %src_input = call float @llvm.fabs.f32(float %src) %result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1) @@ -34,7 +34,7 @@ define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, floa ; GCN-LABEL: {{^}}v_fcmp: ; GCN-NOT: v_cmp_eq_f32_e64 -define void @v_fcmp(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1) store i64 %result, i64 addrspace(1)* %out ret void @@ -42,7 +42,7 @@ define void @v_fcmp(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_oeq: ; GCN: v_cmp_eq_f32_e64 -define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out ret void @@ -50,7 +50,7 @@ define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_one: ; GCN: v_cmp_neq_f32_e64 -define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out ret void @@ -58,7 +58,7 @@ define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ogt: ; GCN: v_cmp_gt_f32_e64 -define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2) store i64 %result, i64 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_oge: ; GCN: v_cmp_ge_f32_e64 -define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out ret void @@ -74,7 +74,7 @@ define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_olt: ; GCN: v_cmp_lt_f32_e64 -define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_olt(i64 
addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out ret void @@ -82,7 +82,7 @@ define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ole: ; GCN: v_cmp_le_f32_e64 -define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out ret void @@ -91,7 +91,7 @@ define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ueq: ; GCN: v_cmp_nlg_f32_e64 -define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out ret void @@ -99,7 +99,7 @@ define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_une: ; GCN: v_cmp_neq_f32_e64 -define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14) store i64 %result, i64 addrspace(1)* %out ret void @@ -107,7 +107,7 @@ define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ugt: ; GCN: v_cmp_nle_f32_e64 -define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out ret void @@ -115,7 +115,7 @@ define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_uge: ; GCN: v_cmp_nlt_f32_e64 -define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out ret void @@ -123,7 +123,7 @@ define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ult: ; GCN: v_cmp_nge_f32_e64 -define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out ret void @@ -131,7 +131,7 @@ define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f32_ule: ; GCN: v_cmp_ngt_f32_e64 -define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { +define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_oeq: ; GCN: v_cmp_eq_f64_e64 -define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1) store i64 %result, i64 addrspace(1)* %out ret void @@ -147,7 +147,7 @@ define void 
@v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_one: ; GCN: v_cmp_neq_f64_e64 -define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6) store i64 %result, i64 addrspace(1)* %out ret void @@ -155,7 +155,7 @@ define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ogt: ; GCN: v_cmp_gt_f64_e64 -define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2) store i64 %result, i64 addrspace(1)* %out ret void @@ -163,7 +163,7 @@ define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_oge: ; GCN: v_cmp_ge_f64_e64 -define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3) store i64 %result, i64 addrspace(1)* %out ret void @@ -171,7 +171,7 @@ define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_olt: ; GCN: v_cmp_lt_f64_e64 -define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4) store i64 %result, i64 addrspace(1)* %out ret void @@ -179,7 +179,7 @@ define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ole: ; GCN: v_cmp_le_f64_e64 -define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5) store i64 %result, i64 addrspace(1)* %out ret void @@ -187,7 +187,7 @@ define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ueq: ; GCN: v_cmp_nlg_f64_e64 -define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9) store i64 %result, i64 addrspace(1)* %out ret void @@ -195,7 +195,7 @@ define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_une: ; GCN: v_cmp_neq_f64_e64 -define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14) store i64 %result, i64 addrspace(1)* %out ret void @@ -203,7 +203,7 @@ define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ugt: ; GCN: v_cmp_nle_f64_e64 -define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10) store i64 %result, i64 addrspace(1)* %out ret void @@ -211,7 +211,7 @@ define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_uge: ; GCN: v_cmp_nlt_f64_e64 -define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, 
double %src) { +define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11) store i64 %result, i64 addrspace(1)* %out ret void @@ -219,7 +219,7 @@ define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ult: ; GCN: v_cmp_nge_f64_e64 -define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12) store i64 %result, i64 addrspace(1)* %out ret void @@ -227,7 +227,7 @@ define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) { ; GCN-LABEL: {{^}}v_fcmp_f64_ule: ; GCN: v_cmp_ngt_f64_e64 -define void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) { +define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) { %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13) store i64 %result, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll index 54d7848da3b..248ee9904da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll @@ -8,7 +8,7 @@ declare float @llvm.amdgcn.fdiv.fast(float, float) #0 ; CHECK: v_rcp_f32_e32 ; CHECK: v_mul_f32_e32 ; CHECK: v_mul_f32_e32 -define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { +define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 { %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b) store float %fdiv, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll index 1c22aa1cb7a..a4ae37b23c5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}test_fmed3_f16: ; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { +define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { %src0.f16 = trunc i32 %src0.arg to i16 %src0 = bitcast i16 %src0.f16 to half %src1.f16 = trunc i32 %src1.arg to i16 @@ -16,7 +16,7 @@ define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.ar ; GCN-LABEL: {{^}}test_fmed3_srcmods_f16: ; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| -define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { +define amdgpu_kernel void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 { %src0.f16 = trunc i32 %src0.arg to i16 %src0 = bitcast i16 %src0.f16 to half %src1.f16 = trunc i32 %src1.arg to i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll index 010599d3b29..230e625ad45 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}test_fmed3: ; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { +define amdgpu_kernel void 
@test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { %mad = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2) store float %mad, float addrspace(1)* %out ret void @@ -11,7 +11,7 @@ define void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, floa ; GCN-LABEL: {{^}}test_fmed3_srcmods: ; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}| -define void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { +define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { %src0.fneg = fsub float -0.0, %src0 %src1.fabs = call float @llvm.fabs.f32(float %src1) %src2.fabs = call float @llvm.fabs.f32(float %src2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll index d5c1c0a0969..b47d2dbc744 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}test_mul_legacy_f32: ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) store float %result, float addrspace(1)* %out, align 4 ret void @@ -12,7 +12,7 @@ define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) # ; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32: ; GCN: v_mul_legacy_f32_e32 -define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a) store float %result, float addrspace(1)* %out, align 4 ret void @@ -20,7 +20,7 @@ define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 { ; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32: ; GCN: v_mul_legacy_f32_e32 -define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef) store float %result, float addrspace(1)* %out, align 4 ret void @@ -28,7 +28,7 @@ define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 { ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32: ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}| -define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 { %a.fabs = call float @llvm.fabs.f32(float %a) %b.fabs = call float @llvm.fabs.f32(float %b) %result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs) @@ -40,7 +40,7 @@ define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float ; GCN-LABEL: {{^}}test_mad_legacy_f32: ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_f32_e32 -define void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { +define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 { %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b) %add = fadd float %mul, %c 
store float %add, float addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll index d8c1af036a3..026f6901fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fract.f16(half %a) ; VI: v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fract_f16( +define amdgpu_kernel void @fract_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll index 53cb607d54f..d4f1c5fd9be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll @@ -6,7 +6,7 @@ declare double @llvm.amdgcn.fract.f64(double) #0 ; GCN-LABEL: {{^}}v_fract_f32: ; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { %fract = call float @llvm.amdgcn.fract.f32(float %src) store float %fract, float addrspace(1)* %out ret void @@ -14,7 +14,7 @@ define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}v_fract_f64: ; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { %fract = call double @llvm.amdgcn.fract.f64(double %src) store double %fract, double addrspace(1)* %out ret void @@ -23,7 +23,7 @@ define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}v_fract_undef_f32: ; GCN-NOT: v_fract_f32 ; GCN-NOT: store_dword -define void @v_fract_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 { %fract = call float @llvm.amdgcn.fract.f32(float undef) store float %fract, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll index 7521224058f..dc3eb4ce191 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll @@ -6,7 +6,7 @@ declare i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a) ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_I16]] -define void @frexp_exp_f16( +define amdgpu_kernel void @frexp_exp_f16( i16 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -21,7 +21,7 @@ entry: ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; VI: v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}} ; GCN: buffer_store_dword v[[R_I32]] -define void @frexp_exp_f16_sext( +define amdgpu_kernel void @frexp_exp_f16_sext( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; VI: v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]] ; VI: v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]] ; GCN: buffer_store_dword v[[R_I32]] -define void @frexp_exp_f16_zext( +define amdgpu_kernel void @frexp_exp_f16_zext( i32 addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll index 9c49f175f2b..0d686147caf 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.frexp.exp.i32.f64(double) #0 ; GCN-LABEL: {{^}}s_test_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src) store i32 %frexp.exp, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}| -define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src) store i32 %frexp.exp, i32 addrspace(1)* %out @@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32: ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}| -define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %fneg.fabs.src = fsub float -0.0, %fabs.src %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src) @@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) ; GCN-LABEL: {{^}}s_test_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src) store i32 %frexp.exp, i32 addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src) store i32 %frexp.exp, i32 addrspace(1)* %out @@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64: ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %fneg.fabs.src = fsub double -0.0, %fabs.src %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll index 706537d7e21..722cd44e99f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.frexp.mant.f16(half %a) ; VI: v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @frexp_mant_f16( +define amdgpu_kernel void @frexp_mant_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll index b8d63defffe..605dc3db2b9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll @@ -8,7 +8,7 @@ declare double @llvm.amdgcn.frexp.mant.f64(double) #0 ; GCN-LABEL: {{^}}s_test_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src) store float %frexp.mant, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}| -define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src) store float %frexp.mant, float addrspace(1)* %out @@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32: ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}| -define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 { %fabs.src = call float @llvm.fabs.f32(float %src) %fneg.fabs.src = fsub float -0.0, %fabs.src %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src) @@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %sr ; GCN-LABEL: {{^}}s_test_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src) store double %frexp.mant, double addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}| -define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src) store double %frexp.mant, double addrspace(1)* %out @@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64: ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}| -define void 
@s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 { %fabs.src = call double @llvm.fabs.f64(double %src) %fneg.fabs.src = fsub double -0.0, %fabs.src %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll index 6014e2ed85f..d26fab4cebe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: {{^}}groupstaticsize_test0: ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}} -define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { +define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 @@ -23,7 +23,7 @@ define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* % ; CHECK-LABEL: {{^}}groupstaticsize_test1: ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}} -define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { +define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) { entry: %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1 store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4 @@ -51,7 +51,7 @@ endif: ; preds = %else, %if ; Exceeds 16-bit simm limit of s_movk_i32 ; CHECK-LABEL: {{^}}large_groupstaticsize: ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} -define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { +define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 { %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx store volatile i32 0, i32 addrspace(3)* %gep %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll index 987a41bc1f9..aa04af7a64a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll @@ -7,7 +7,7 @@ declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0 ; No crash on invalid input ; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc: ; GCN: s_endpgm -define void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) { +define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc) store i64 %result, i64 addrspace(1)* %out ret void @@ -15,7 +15,7 @@ define void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) { ; GCN-LABEL: {{^}}v_icmp_i32_eq: ; GCN: v_cmp_eq_u32_e64 -define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void @@ -23,14 +23,14 @@ define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp: ; GCN-NOT: v_cmp_eq_u32_e64 -define void @v_icmp(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp(i64 
addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i32_ne: ; GCN: v_cmp_ne_u32_e64 -define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33) store i64 %result, i64 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_ugt: ; GCN: v_cmp_gt_u32_e64 -define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34) store i64 %result, i64 addrspace(1)* %out ret void @@ -46,7 +46,7 @@ define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_uge: ; GCN: v_cmp_ge_u32_e64 -define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35) store i64 %result, i64 addrspace(1)* %out ret void @@ -54,7 +54,7 @@ define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_ult: ; GCN: v_cmp_lt_u32_e64 -define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36) store i64 %result, i64 addrspace(1)* %out ret void @@ -62,7 +62,7 @@ define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_u32_ule: ; GCN: v_cmp_le_u32_e64 -define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37) store i64 %result, i64 addrspace(1)* %out ret void @@ -70,7 +70,7 @@ define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_i32_sgt: ; GCN: v_cmp_gt_i32_e64 -define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { +define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38) store i64 %result, i64 addrspace(1)* %out ret void @@ -78,7 +78,7 @@ define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 { ; GCN-LABEL: {{^}}v_icmp_i32_sge: ; GCN: v_cmp_ge_i32_e64 -define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39) store i64 %result, i64 addrspace(1)* %out ret void @@ -86,14 +86,14 @@ define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_i32_slt: ; GCN: v_cmp_lt_i32_e64 -define void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i32_sle: ; GCN: v_cmp_le_i32_e64 -define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { %result = call i64 @llvm.amdgcn.icmp.i32(i32 
%src, i32 100, i32 41) store i64 %result, i64 addrspace(1)* %out ret void @@ -101,7 +101,7 @@ define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_eq: ; GCN: v_cmp_eq_u64_e64 -define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32) store i64 %result, i64 addrspace(1)* %out ret void @@ -109,7 +109,7 @@ define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_ne: ; GCN: v_cmp_ne_u64_e64 -define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33) store i64 %result, i64 addrspace(1)* %out ret void @@ -117,7 +117,7 @@ define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_ugt: ; GCN: v_cmp_gt_u64_e64 -define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34) store i64 %result, i64 addrspace(1)* %out ret void @@ -125,7 +125,7 @@ define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_uge: ; GCN: v_cmp_ge_u64_e64 -define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35) store i64 %result, i64 addrspace(1)* %out ret void @@ -133,7 +133,7 @@ define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_ult: ; GCN: v_cmp_lt_u64_e64 -define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36) store i64 %result, i64 addrspace(1)* %out ret void @@ -141,7 +141,7 @@ define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_u64_ule: ; GCN: v_cmp_le_u64_e64 -define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37) store i64 %result, i64 addrspace(1)* %out ret void @@ -149,7 +149,7 @@ define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_sgt: ; GCN: v_cmp_gt_i64_e64 -define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38) store i64 %result, i64 addrspace(1)* %out ret void @@ -157,7 +157,7 @@ define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_sge: ; GCN: v_cmp_ge_i64_e64 -define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39) store i64 %result, i64 addrspace(1)* %out ret void @@ -165,14 +165,14 @@ define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_icmp_i64_slt: ; GCN: v_cmp_lt_i64_e64 -define void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void 
@v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40) store i64 %result, i64 addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}v_icmp_i64_sle: ; GCN: v_cmp_le_i64_e64 -define void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) { %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41) store i64 %result, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll index a65f422742c..a9351dbb27d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}gather4_v2: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}gather4: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_cl: ; GCN: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_l: ; GCN: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b: ; GCN: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_cl: ; GCN: image_gather4_b_cl 
{{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_cl_v8: ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_lz_v2: ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_lz: ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -86,7 +86,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_o: ; GCN: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -95,7 +95,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_cl_o: ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -104,7 +104,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_cl_o_v8: ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_cl_o_v8(<4 x 
float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -113,7 +113,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_l_o: ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -122,7 +122,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_l_o_v8: ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -131,7 +131,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_o: ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -140,7 +140,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_o_v8: ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -149,7 +149,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_b_cl_o: ; GCN: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -158,7 +158,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_lz_o: ; GCN: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_lz_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 
0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -168,7 +168,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c: ; GCN: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -177,7 +177,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_cl: ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -186,7 +186,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_cl_v8: ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -195,7 +195,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_l: ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -204,7 +204,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_l_v8: ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -213,7 +213,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b: ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -222,7 +222,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_v8: ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -231,7 +231,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_cl: ; GCN: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -240,7 +240,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_lz: ; GCN: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -250,7 +250,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_o: ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -259,7 +259,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_o_v8: ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -268,7 +268,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_cl_o: ; GCN: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -277,7 +277,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_l_o: ; GCN: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_l_o(<4 x float> 
addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -286,7 +286,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_o: ; GCN: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -295,7 +295,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_b_cl_o: ; GCN: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -304,7 +304,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_lz_o: ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -313,7 +313,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_c_lz_o_v8: ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -322,7 +322,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_f32: ; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @gather4_f32(float addrspace(1)* %out) { main_body: %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) store float %r, float addrspace(1)* %out @@ -331,7 +331,7 @@ main_body: ; GCN-LABEL: {{^}}gather4_v2f32: ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define void @gather4_v2f32(<2 x float> addrspace(1)* %out) { +define amdgpu_kernel void @gather4_v2f32(<2 x float> addrspace(1)* %out) { main_body: %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1) store <2 x 
float> %r, <2 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll index c7a04614ade..2e78e2a4c6f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}getlod: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}getlod_v2: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod_v2(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}getlod_v4: ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da -define void @getlod_v4(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -31,7 +31,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_getlod_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll index e6412b3ec58..4f90b0a25ea 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}sample: ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cl: ; GCN: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> 
@llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d: ; GCN: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d_cl: ; GCN: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ main_body: ; GCN-LABEL: {{^}}sample_l: ; GCN: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b: ; GCN: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b_cl: ; GCN: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ main_body: ; GCN-LABEL: {{^}}sample_lz: ; GCN: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd: ; GCN: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -84,7 +84,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd_cl: ; GCN: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -93,7 +93,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c: ; GCN: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -102,7 +102,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cl: ; GCN: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -111,7 +111,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d: ; GCN: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d_cl: ; GCN: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -129,7 +129,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_l: ; GCN: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> 
@llvm.amdgcn.image.sample.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -138,7 +138,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b: ; GCN: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -147,7 +147,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b_cl: ; GCN: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -156,7 +156,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_lz: ; GCN: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -165,7 +165,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd: ; GCN: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -174,7 +174,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd_cl: ; GCN: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -183,7 +183,7 @@ main_body: ; GCN-LABEL: {{^}}sample_f32: ; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 -define void @sample_f32(float addrspace(1)* %out) { +define amdgpu_kernel void @sample_f32(float addrspace(1)* %out) { main_body: %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) store float %r, float addrspace(1)* %out @@ -192,7 +192,7 @@ main_body: ; GCN-LABEL: {{^}}sample_v2f32: ; GCN: image_sample 
{{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 -define void @sample_v2f32(<2 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_v2f32(<2 x float> addrspace(1)* %out) { main_body: %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) store <2 x float> %r, <2 x float> addrspace(1)* %out @@ -201,7 +201,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_0: ; GCN: image_sample v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1{{$}} -define void @adjust_writemask_sample_0(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_0(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -211,7 +211,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_01: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x3{{$}} -define void @adjust_writemask_sample_01(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_01(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -223,7 +223,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_012: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x7{{$}} -define void @adjust_writemask_sample_012(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_012(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -237,7 +237,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_12: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x6{{$}} -define void @adjust_writemask_sample_12(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_12(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt1 = extractelement <4 x float> %r, i32 1 @@ -249,7 +249,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_03: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x9{{$}} -define void @adjust_writemask_sample_03(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_03(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt0 = extractelement <4 x float> %r, i32 0 @@ -261,7 +261,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_13: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xa{{$}} -define void 
@adjust_writemask_sample_13(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_13(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt1 = extractelement <4 x float> %r, i32 1 @@ -273,7 +273,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_123: ; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xe{{$}} -define void @adjust_writemask_sample_123(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_123(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) %elt1 = extractelement <4 x float> %r, i32 1 @@ -288,7 +288,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_variable_dmask_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) { +define amdgpu_kernel void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 %dmask, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -300,7 +300,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -311,7 +311,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -322,7 +322,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_d_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -333,7 +333,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 
0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -344,7 +344,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_l_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -355,7 +355,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_b_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -366,7 +366,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -377,7 +377,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_lz_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -388,7 +388,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -399,7 +399,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll index fd5e1389176..42d7bc0e777 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}sample: ; GCN: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -12,7 +12,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cl: ; GCN: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -21,7 +21,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d: ; GCN: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -30,7 +30,7 @@ main_body: ; GCN-LABEL: {{^}}sample_d_cl: ; GCN: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -39,7 +39,7 @@ main_body: ; GCN-LABEL: {{^}}sample_l: ; GCN: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -48,7 +48,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b: ; GCN: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -57,7 +57,7 @@ main_body: ; GCN-LABEL: {{^}}sample_b_cl: ; GCN: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void 
@sample_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ main_body: ; GCN-LABEL: {{^}}sample_lz: ; GCN: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -75,7 +75,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd: ; GCN: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -84,7 +84,7 @@ main_body: ; GCN-LABEL: {{^}}sample_cd_cl: ; GCN: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -93,7 +93,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c: ; GCN: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -102,7 +102,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cl: ; GCN: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -111,7 +111,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d: ; GCN: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 
15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -120,7 +120,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_d_cl: ; GCN: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -129,7 +129,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_l: ; GCN: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -138,7 +138,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b: ; GCN: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -147,7 +147,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_b_cl: ; GCN: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -156,7 +156,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_lz: ; GCN: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -165,7 +165,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd: ; GCN: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -174,7 +174,7 @@ main_body: ; GCN-LABEL: {{^}}sample_c_cd_cl: ; GCN: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { +define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0) store <4 x float> %r, <4 x float> addrspace(1)* %out @@ -184,7 +184,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -195,7 +195,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -206,7 +206,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_d_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -217,7 +217,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -228,7 +228,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_l_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -239,7 +239,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_b_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 
x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -250,7 +250,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -261,7 +261,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_lz_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -272,7 +272,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -283,7 +283,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -294,7 +294,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -305,7 +305,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -316,7 +316,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_o_none_enabled: ; GCN-NOT: 
image ; GCN-NOT: store -define void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -327,7 +327,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -338,7 +338,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_l_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -349,7 +349,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -360,7 +360,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -371,7 +371,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_lz_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -382,7 +382,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call 
<4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 @@ -393,7 +393,7 @@ main_body: ; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_cl_o_none_enabled: ; GCN-NOT: image ; GCN-NOT: store -define void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) { +define amdgpu_kernel void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) { main_body: %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false) %elt0 = extractelement <4 x float> %r, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll index 5d4d4cd7ee4..055dddbfa8a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll @@ -8,7 +8,7 @@ ; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa -define void @test(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 { %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)* %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 @@ -20,7 +20,7 @@ define void @test(i32 addrspace(1)* %out) #1 { ; ALL-LABEL: {{^}}test_implicit: ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15 -define void @test_implicit(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10 @@ -39,7 +39,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 { ; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]] ; MESA: buffer_store_dword [[V_VAL]] ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]] -define void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { +define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)* %val = load i32, i32 addrspace(2)* %arg.ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index 6720cbe9d8d..fe211d35607 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b) ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16( +define amdgpu_kernel void @ldexp_f16( half addrspace(1)* %r, half addrspace(1)* %a, i32 addrspace(1)* %b) { @@ -22,7 +22,7 @@ define void @ldexp_f16( ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16_imm_a( +define amdgpu_kernel void 
@ldexp_f16_imm_a( half addrspace(1)* %r, i32 addrspace(1)* %b) { %b.val = load i32, i32 addrspace(1)* %b @@ -35,7 +35,7 @@ define void @ldexp_f16_imm_a( ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}} ; GCN: buffer_store_short v[[R_F16]] -define void @ldexp_f16_imm_b( +define amdgpu_kernel void @ldexp_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { %a.val = load half, half addrspace(1)* %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll index a23defd742a..1ab4e8b8063 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll @@ -7,7 +7,7 @@ declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone ; SI-LABEL: {{^}}test_ldexp_f32: ; SI: v_ldexp_f32 ; SI: s_endpgm -define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind { %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind ; SI-LABEL: {{^}}test_ldexp_f64: ; SI: v_ldexp_f64 ; SI: s_endpgm -define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind { %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone store double %result, double addrspace(1)* %out, align 8 ret void @@ -24,7 +24,7 @@ define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwi ; SI-LABEL: {{^}}test_ldexp_undef_f32: ; SI-NOT: v_ldexp_f32 -define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { +define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone store float %result, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll index 014369b4501..bc599897f82 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_lerp: ; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { +define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind { %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll index f78257f1d22..feecd6c0e35 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.log.clamp.f32(float) #0 ; GCN-LABEL: {{^}}v_log_clamp_f32: ; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 { %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0 store float %log.clamp, float addrspace(1)* %out ret void diff 
--git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll index 35fdba8f34a..8baaad19040 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll @@ -7,7 +7,7 @@ ; VI: v_mov_b32_e32 v0, s{{[0-9]+}} ; VI: s_nop 1 ; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11] -define void @dpp_test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp0, i32 addrspace(1)* %out ret void @@ -19,7 +19,7 @@ define void @dpp_test(i32 addrspace(1)* %out, i32 %in) { ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 -define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0 %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0 store i32 %tmp1, i32 addrspace(1)* %out @@ -36,7 +36,7 @@ define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) { ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; VI: s_nop 1 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 -define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) { +define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) { %cmp = fcmp oeq float %cond, 0.0 br i1 %cmp, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll index 7c2495e096e..3a2b87cd87f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll @@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8: ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate: ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { +define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll index 04bb97a9eb5..a8d03bf6bba 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll @@ -5,7 +5,7 @@ declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { +define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { +define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %ou ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { +define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) { %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0 store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -37,7 +37,7 @@ define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i6 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr: ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { +define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) { %in = load <4 x i32>, <4 x i32> addrspace(1) * %input %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll index 83d13ab2684..dfaac042227 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_msad_u8: ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_msad_u8_non_immediate: ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll index ece4224f6e6..be71225c5e0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll @@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8: ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { +define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) { ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate: ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { +define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) { %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0 store i64 %result, i64 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll index 6bf871543ca..9200fe7c67b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}test: ; GCN: enable_sgpr_queue_ptr = 1 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0 -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0 %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)* %value = load i32, i32 addrspace(2)* %header_ptr diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll index f0b8e2a0293..0f1fa15f47c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rcp.f16(half %a) ; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rcp_f16( +define amdgpu_kernel void 
@rcp_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll index d53861456c7..71db76d902b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll @@ -7,7 +7,7 @@ declare float @llvm.amdgcn.rcp.legacy(float) #0 ; GCN-LABEL: {{^}}rcp_legacy_f32: ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -16,7 +16,7 @@ define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 { ; TODO: Really these should be constant folded ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0 -define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -24,7 +24,7 @@ define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -32,7 +32,7 @@ define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { ; GCN-LABEL: {{^}}rcp_legacy_undef_f32: ; GCN-NOT: v_rcp_legacy_f32 -define void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.legacy(float undef) store float %rcp, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll index 9f89e563cb8..ad2d84b7911 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -8,7 +8,7 @@ declare float @llvm.sqrt.f32(float) #0 ; FUNC-LABEL: {{^}}rcp_undef_f32: ; SI-NOT: v_rcp_f32 -define void @rcp_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float undef) store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -17,7 +17,7 @@ define void @rcp_undef_f32(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rcp_2_f32: ; SI-NOT: v_rcp_f32 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5 -define void @rcp_2_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float 2.0) store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -26,7 +26,7 @@ define void @rcp_2_f32(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rcp_10_f32: ; SI-NOT: v_rcp_f32 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd -define void @rcp_10_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 { %rcp = call float @llvm.amdgcn.rcp.f32(float 10.0) store float %rcp, float addrspace(1)* %out, align 4 ret 
void @@ -36,7 +36,7 @@ define void @rcp_10_f32(float addrspace(1)* %out) #1 { ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] -define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -46,7 +46,7 @@ define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dword [[RESULT]] -define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { +define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -54,7 +54,7 @@ define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src ; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32: ; SI: v_div_scale_f32 -define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 { +define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -63,7 +63,7 @@ define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %s ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32: ; SI: v_sqrt_f32_e32 ; SI: v_rcp_f32_e32 -define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, float addrspace(1)* %out, align 4 @@ -72,7 +72,7 @@ define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 { ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32: ; SI: v_rsq_f32_e32 -define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { %sqrt = call float @llvm.sqrt.f32(float %src) %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt) store float %rcp, float addrspace(1)* %out, align 4 @@ -83,7 +83,7 @@ define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 { ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @rcp_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 { %rcp = call double @llvm.amdgcn.rcp.f64(double %src) store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -93,7 +93,7 @@ define void @rcp_f64(double addrspace(1)* %out, double %src) #1 { ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { %rcp = call double @llvm.amdgcn.rcp.f64(double %src) store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -101,7 +101,7 @@ define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 { ; FUNC-LABEL: {{^}}rcp_pat_f64: ; SI: v_div_scale_f64 -define void 
@rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { %rcp = fdiv double 1.0, %src store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -111,7 +111,7 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { %rcp = fdiv double 1.0, %src store double %rcp, double addrspace(1)* %out, align 8 ret void @@ -121,7 +121,7 @@ define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { ; SI-NOT: v_rsq_f64_e32 ; SI: v_sqrt_f64 ; SI: v_rcp_f64 -define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, double addrspace(1)* %out, align 8 @@ -132,7 +132,7 @@ define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 { ; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}} ; SI-NOT: [[RESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { +define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 { %sqrt = call double @llvm.sqrt.f64(double %src) %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt) store double %rcp, double addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 2569108e7b1..9f5c809455e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #0 ; CHECK-LABEL: {{^}}test_readfirstlane: ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}} -define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { +define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 { ; CHECK-LABEL: {{^}}test_readfirstlane_imm: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] -define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 { ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]] -define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0) store i32 %readfirstlane, i32 addrspace(1)* %out, align 4 diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index 436ffff692c..5e892fad374 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32) #0 ; CHECK-LABEL: {{^}}test_readlane_sreg: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 ; CHECK-LABEL: {{^}}test_readlane_imm_sreg: ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} -define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]] ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}} -define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { +define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"() %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1) store i32 %readlane, i32 addrspace(1)* %out, align 4 @@ -34,7 +34,7 @@ define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 { ; CHECK-LABEL: {{^}}test_readlane_imm: ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32 -define void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { +define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 { %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0 store i32 %readlane, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll index 5f40e0d0986..3611047f127 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -12,7 +12,7 @@ declare double @llvm.amdgcn.rsq.clamp.f64(double) #1 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] ; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]] ; VI: buffer_store_dword [[RESULT]] -define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src) store float %rsq_clamp, float addrspace(1)* %out ret void @@ -30,7 +30,7 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 { ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}} ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] -define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { +define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { %rsq_clamp = call double 
@llvm.amdgcn.rsq.clamp.f64(double %src) store double %rsq_clamp, double addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 { ; FUNC-LABEL: {{^}}rsq_clamp_undef_f32: ; SI-NOT: v_rsq_clamp_f32 -define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) store float %rsq_clamp, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll index 2022d028986..fd480214081 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rsq.f16(half %a) ; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @rsq_f16( +define amdgpu_kernel void @rsq_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll index 47bd0d82b83..7f4c2cb19a3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.rsq.legacy(float) #0 ; FUNC-LABEL: {{^}}rsq_legacy_f32: ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0 -define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_legacy_undef_f32: ; SI-NOT: v_rsq_legacy_f32 -define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float undef) store float %rsq, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll index c644288977a..0ce26d0fe87 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -6,7 +6,7 @@ declare double @llvm.amdgcn.rsq.f64(double) #0 ; FUNC-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void 
@rsq_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -15,7 +15,7 @@ define void @rsq_f32(float addrspace(1)* %out, float %src) #1 { ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -23,7 +23,7 @@ define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0 store float %rsq, float addrspace(1)* %out, align 4 ret void @@ -31,7 +31,7 @@ define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_f64: ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @rsq_f64(double addrspace(1)* %out, double %src) #1 { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -40,7 +40,7 @@ define void @rsq_f64(double addrspace(1)* %out, double %src) #1 { ; TODO: Really these should be constant folded ; FUNC-LABEL: {{^}}rsq_f64_constant_4.0 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0 -define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -50,7 +50,7 @@ define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 { ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}} ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} -define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0 store double %rsq, double addrspace(1)* %out, align 4 ret void @@ -58,7 +58,7 @@ define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}rsq_undef_f32: ; SI-NOT: v_rsq_f32 -define void @rsq_undef_f32(float addrspace(1)* %out) #1 { +define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 { %rsq = call float @llvm.amdgcn.rsq.f32(float undef) store float %rsq, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll index 9559b5a84b1..5f8ca28ec5f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -7,7 +7,7 @@ ; GFX9: flat_store_dword ; GFX9-NOT: s_waitcnt ; GCN: s_barrier -define void @test_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll index ecd4ac6824c..b488565c6b3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll @@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7] ; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_s_dcache_inv() #0 { +define amdgpu_kernel void @test_s_dcache_inv() #0 { call void @llvm.amdgcn.s.dcache.inv() ret void } @@ -18,7 +18,7 @@ define void @test_s_dcache_inv() #0 { ; GCN-NEXT: ; BB#0: ; GCN: s_dcache_inv ; GCN: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_inv_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll index 097f35d42c4..a3a5c329f41 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll @@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7] ; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00] ; GCN-NEXT: s_endpgm -define void @test_s_dcache_inv_vol() #0 { +define amdgpu_kernel void @test_s_dcache_inv_vol() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() ret void } @@ -18,7 +18,7 @@ define void @test_s_dcache_inv_vol() #0 { ; GCN-NEXT: ; BB#0: ; GCN-NEXT: s_dcache_inv_vol ; GCN: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_inv_vol_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.inv.vol() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll index 9ecce7463f6..909a85dda3e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll @@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_endpgm -define void @test_s_dcache_wb() #0 { +define amdgpu_kernel void @test_s_dcache_wb() #0 { call void @llvm.amdgcn.s.dcache.wb() ret void } @@ -16,7 +16,7 @@ define void @test_s_dcache_wb() #0 { ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb ; VI: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_wb_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll index 943f8c67a2e..217bf97c41a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll @@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0 ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00] ; VI-NEXT: s_endpgm -define void @test_s_dcache_wb_vol() #0 { +define amdgpu_kernel void @test_s_dcache_wb_vol() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() 
ret void } @@ -16,7 +16,7 @@ define void @test_s_dcache_wb_vol() #0 { ; VI-NEXT: ; BB#0: ; VI-NEXT: s_dcache_wb_vol ; VI: s_waitcnt lgkmcnt(0) ; encoding -define void @test_s_dcache_wb_vol_insert_wait() #0 { +define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 { call void @llvm.amdgcn.s.dcache.wb.vol() call void @llvm.amdgcn.s.waitcnt(i32 0) br label %end diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll index 72513fc86f4..8f64c50b9c6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.decperflevel(i32) #0 ; GCN: s_decperflevel 13{{$}} ; GCN: s_decperflevel 14{{$}} ; GCN: s_decperflevel 15{{$}} -define void @test_s_decperflevel(i32 %x) #0 { +define amdgpu_kernel void @test_s_decperflevel(i32 %x) #0 { call void @llvm.amdgcn.s.decperflevel(i32 0) call void @llvm.amdgcn.s.decperflevel(i32 1) call void @llvm.amdgcn.s.decperflevel(i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll index 4304398182a..906a8a3e05f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}s_getreg_test: ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23) -define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. +define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) %lds_size_bytes = shl i32 %lds_size_64dwords, 8 store i32 %lds_size_bytes, i32 addrspace(1)* %out @@ -14,7 +14,7 @@ define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size ; Call site has additional readnone knowledge. ; GCN-LABEL: {{^}}readnone_s_getreg_test: ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23) -define void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. +define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size. 
%lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1 %lds_size_bytes = shl i32 %lds_size_64dwords, 8 store i32 %lds_size_bytes, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll index 2ae4fc473ea..49e6e425790 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.incperflevel(i32) #0 ; GCN: s_incperflevel 13{{$}} ; GCN: s_incperflevel 14{{$}} ; GCN: s_incperflevel 15{{$}} -define void @test_s_incperflevel(i32 %x) #0 { +define amdgpu_kernel void @test_s_incperflevel(i32 %x) #0 { call void @llvm.amdgcn.s.incperflevel(i32 0) call void @llvm.amdgcn.s.incperflevel(i32 1) call void @llvm.amdgcn.s.incperflevel(i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll index d8eda10fdfd..66041037168 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll @@ -10,7 +10,7 @@ declare i64 @llvm.amdgcn.s.memrealtime() #0 ; GCN-NOT: lgkmcnt ; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: _store_dwordx2 -define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.amdgcn.s.memrealtime() store volatile i64 %cycle0, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll index ff9d7461978..6aef769bafa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll @@ -11,7 +11,7 @@ declare i64 @llvm.amdgcn.s.memtime() #0 ; GCN-NOT: lgkmcnt ; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 -define void @test_s_memtime(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.amdgcn.s.memtime() store volatile i64 %cycle0, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll index 870aa48a341..59c910c71c5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll @@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.sleep(i32) #0 ; GCN: s_sleep 13{{$}} ; GCN: s_sleep 14{{$}} ; GCN: s_sleep 15{{$}} -define void @test_s_sleep(i32 %x) #0 { +define amdgpu_kernel void @test_s_sleep(i32 %x) #0 { call void @llvm.amdgcn.s.sleep(i32 0) call void @llvm.amdgcn.s.sleep(i32 1) call void @llvm.amdgcn.s.sleep(i32 2) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll index 3aaed9d5377..2a3705de2b4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_hi_u8: ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: 
{{^}}v_sad_hi_u8_non_immediate: ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll index 5438571c582..c404531513e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_u16: ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_sad_u16_non_immediate: ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll index 9422d7620ca..1ee876aa724 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0 ; GCN-LABEL: {{^}}v_sad_u8: ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { +define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) { ; GCN-LABEL: {{^}}v_sad_u8_non_immediate: ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { +define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) { %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll index c51c1c957f8..4e232a3e152 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg: ; GCN: v_bfe_i32 -define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -11,7 +11,7 @@ define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 
%src1, i ; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm: ; GCN: v_bfe_i32 -define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) # ; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg: ; GCN: v_bfe_i32 -define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) # ; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg: ; GCN: v_bfe_i32 -define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) # ; GCN-LABEL: {{^}}v_bfe_print_arg: ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8 -define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { +define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 { %load = load i32, i32 addrspace(1)* %src0, align 4 %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 @@ -45,7 +45,7 @@ define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 ; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -54,7 +54,7 @@ define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i ; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -64,7 +64,7 @@ define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: s_endpgm -define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31) @@ -78,7 +78,7 @@ define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: 
v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31) @@ -90,7 +90,7 @@ define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: buffer_load_dword ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; GCN: s_endpgm -define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -103,7 +103,7 @@ define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -115,7 +115,7 @@ define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -127,7 +127,7 @@ define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -139,7 +139,7 @@ define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -150,7 +150,7 @@ define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -161,7 +161,7 @@ define void @bfe_i32_test_13(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN-NOT: lshr ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1) @@ -173,7 +173,7 @@ define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -195,7 +195,7 @@ define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -206,7 +206,7 @@ define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -217,7 +217,7 @@ define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -228,7 +228,7 @@ define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -239,7 +239,7 @@ define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void 
@bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -250,7 +250,7 @@ define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -261,7 +261,7 @@ define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -272,7 +272,7 @@ define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -283,7 +283,7 @@ define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -294,7 +294,7 @@ define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -305,7 +305,7 @@ define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -316,7 +316,7 @@ define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -327,7 +327,7 @@ define void 
@bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -338,7 +338,7 @@ define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -349,7 +349,7 @@ define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -360,7 +360,7 @@ define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -371,7 +371,7 @@ define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1) store i32 %bfe_i32, i32 addrspace(1)* %out, align 4 ret void @@ -383,7 +383,7 @@ define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { ; GCN-NOT: v_ashr ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24 ; GCN: buffer_store_dword [[BFE]], -define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24) %shl = shl i32 %bfe, 8 @@ -399,7 +399,7 @@ define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]] ; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]] ; GCN: buffer_store_dword [[TMP2]] -define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) %div = sdiv i32 %bfe, 2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll index 
a1030b502bb..94aeb077ebe 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll @@ -5,7 +5,7 @@ ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_INTERRUPT) -define void @test_interrupt() { +define amdgpu_kernel void @test_interrupt() { body: call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0); ret void @@ -15,7 +15,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_gs_emit() { +define amdgpu_kernel void @test_gs_emit() { body: call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0); ret void @@ -25,7 +25,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_gs_cut() { +define amdgpu_kernel void @test_gs_cut() { body: call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0); ret void @@ -35,7 +35,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_gs_emit_cut() { +define amdgpu_kernel void @test_gs_emit_cut() { body: call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0) ret void @@ -45,7 +45,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_gs_done() { +define amdgpu_kernel void @test_gs_done() { body: call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0) ret void @@ -66,7 +66,7 @@ define amdgpu_gs void @sendmsg(i32 inreg %a) #0 { ; VI-NEXT: s_nop 0 ; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT) ; GCN-NEXT: s_endpgm -define void @sendmsghalt(i32 inreg %a) #0 { +define amdgpu_kernel void @sendmsghalt(i32 inreg %a) #0 { call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a) ret void } @@ -75,7 +75,7 @@ define void @sendmsghalt(i32 inreg %a) #0 { ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_INTERRUPT) -define void @test_interrupt_halt() { +define amdgpu_kernel void @test_interrupt_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0) ret void @@ -85,7 +85,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0) -define void @test_gs_emit_halt() { +define amdgpu_kernel void @test_gs_emit_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0) ret void @@ -95,7 +95,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1) -define void @test_gs_cut_halt() { +define amdgpu_kernel void @test_gs_cut_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0) ret void @@ -105,7 +105,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2) -define void @test_gs_emit_cut_halt() { +define amdgpu_kernel void @test_gs_emit_cut_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0) ret void @@ -115,7 +115,7 @@ body: ; GCN: s_mov_b32 m0, 0 ; GCN-NOT: s_mov_b32 m0 ; GCN: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP) -define void @test_gs_done_halt() { +define amdgpu_kernel void @test_gs_done_halt() { body: call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll index 09f8f485869..495e36b09f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.sffbh.i32(i32) #1 ; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]] ; GCN: 
v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; GCN: buffer_store_dword [[VRESULT]], -define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { +define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val) store i32 %r, i32 addrspace(1)* %out, align 4 ret void @@ -18,7 +18,7 @@ define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 { ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[RESULT]], -define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { +define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 { %val = load i32, i32 addrspace(1)* %valptr, align 4 %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val) store i32 %r, i32 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll index fac0e352614..4b930bfa210 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll @@ -7,7 +7,7 @@ declare half @llvm.amdgcn.sin.f16(half %a) ; VI: v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sin_f16( +define amdgpu_kernel void @sin_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll index e3692fc5906..0b7064da23f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll @@ -5,7 +5,7 @@ declare float @llvm.amdgcn.sin.f32(float) #0 ; GCN-LABEL: {{^}}v_sin_f32: ; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 { %sin = call float @llvm.amdgcn.sin.f32(float %src) #0 store float %sin, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll index caac6ddbeb8..e0cec2134e7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll @@ -9,7 +9,7 @@ declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]] ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load double, double addrspace(1)* %aptr, align 8 %b = load i32, i32 addrspace(1)* %bptr, align 4 %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone @@ -22,7 +22,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7 ; SI: buffer_store_dwordx2 [[RESULT]], ; SI: s_endpgm -define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { +define amdgpu_kernel void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind { %a = load double, double addrspace(1)* %aptr, align 8 %result = call double 
@llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone store double %result, double addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll index 8a2990a8cbd..92e3a1099da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg: ; GCN: v_bfe_u32 -define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -11,7 +11,7 @@ define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i ; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm: ; GCN: v_bfe_u32 -define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) # ; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg: ; GCN: v_bfe_u32 -define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { +define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -27,7 +27,7 @@ define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) # ; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg: ; GCN: v_bfe_u32 -define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { +define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -36,7 +36,7 @@ define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) # ; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i ; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { +define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -55,7 +55,7 @@ define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i ; GCN: buffer_load_ubyte ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zextload_i8(i32 
addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %load = load i8, i8 addrspace(1)* %in %ext = zext i8 %load to i32 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8) @@ -70,7 +70,7 @@ define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) # ; FIXME: Should be using s_add_i32 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -85,7 +85,7 @@ define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %i ; GCN-NEXT: v_and_b32_e32 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -99,7 +99,7 @@ define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* % ; GCN: v_add_i32 ; GCN: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -114,7 +114,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspa ; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8 ; GCN-NEXT: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -129,7 +129,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspa ; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80 ; GCN-NEXT: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 255 @@ -143,7 +143,7 @@ define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspa ; GCN: v_add_i32 ; GCN-NEXT: bfe ; GCN: s_endpgm -define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %add = add i32 %load, 1 %ext = and i32 %add, 65535 @@ -156,14 +156,14 @@ define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrsp ; GCN: buffer_load_dword ; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; GCN: s_endpgm -define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void } -define void @bfe_u32_test_2(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8) @@ -171,7 +171,7 @@ define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ret void } -define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1) @@ -186,7 +186,7 @@ define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0 ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm -define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = lshr i32 %shl, 31 @@ -201,7 +201,7 @@ define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN-NOT: shr ; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1 ; GCN: s_endpgm -define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -214,7 +214,7 @@ define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: s_endpgm -define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31) @@ -226,7 +226,7 @@ define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31) @@ -239,7 +239,7 @@ define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -252,7 +252,7 @@ define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call 
i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -264,7 +264,7 @@ define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -276,7 +276,7 @@ define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -288,7 +288,7 @@ define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8) store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -299,7 +299,7 @@ define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = ashr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -310,7 +310,7 @@ define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN-NOT: lshr ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = lshr i32 %x, 31 %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1) @@ -323,7 +323,7 @@ define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -335,7 +335,7 @@ define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -347,7 +347,7 @@ 
define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -359,7 +359,7 @@ define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -371,7 +371,7 @@ define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -383,7 +383,7 @@ define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -395,7 +395,7 @@ define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -407,7 +407,7 @@ define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -419,7 +419,7 @@ define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -431,7 +431,7 @@ define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -443,7 +443,7 @@ define void 
@bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -455,7 +455,7 @@ define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -467,7 +467,7 @@ define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -479,7 +479,7 @@ define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -491,7 +491,7 @@ define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -503,7 +503,7 @@ define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -515,7 +515,7 @@ define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -527,7 +527,7 @@ define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -539,7 
+539,7 @@ define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 { ; GCN: buffer_store_dword [[VREG]], ; GCN: s_endpgm ; EG-NOT: BFE -define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1) store i32 %bfe_u32, i32 addrspace(1)* %out, align 4 ret void @@ -556,7 +556,7 @@ define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 { ; GCN-DAG: buffer_store_dword [[AND]] ; GCN-DAG: buffer_store_dword [[BFE]] ; GCN: s_endpgm -define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, +define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %src = load i32, i32 addrspace(1)* %in, align 4 @@ -570,7 +570,7 @@ define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0, ; GCN-LABEL: {{^}}lshr_and: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; GCN: buffer_store_dword -define void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { %b = lshr i32 %a, 6 %c = and i32 %b, 7 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -580,7 +580,7 @@ define void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 { ; GCN-LABEL: {{^}}v_lshr_and: ; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3 ; GCN: buffer_store_dword -define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = lshr i32 %a, %b %d = and i32 %c, 7 store i32 %d, i32 addrspace(1)* %out, align 8 @@ -590,7 +590,7 @@ define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-LABEL: {{^}}and_lshr: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; GCN: buffer_store_dword -define void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { %b = and i32 %a, 448 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -600,7 +600,7 @@ define void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 { ; GCN-LABEL: {{^}}and_lshr2: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006 ; GCN: buffer_store_dword -define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { %b = and i32 %a, 511 %c = lshr i32 %b, 6 store i32 %c, i32 addrspace(1)* %out, align 8 @@ -610,7 +610,7 @@ define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 { ; GCN-LABEL: {{^}}shl_lshr: ; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002 ; GCN: buffer_store_dword -define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 { %b = shl i32 %a, 9 %c = lshr i32 %b, 11 store i32 %c, i32 addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll index e8517975537..e305f8eff58 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll @@ -4,7 +4,7 @@ ; GCN-DAG: ; wave barrier ; GCN-NOT: s_barrier -define void @test_wave_barrier() #0 { +define amdgpu_kernel void @test_wave_barrier() #0 { entry: call void @llvm.amdgcn.wave.barrier() #1 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll index 58529b87444..349e7f0f0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll @@ -34,7 +34,7 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %id, i32 addrspace(1)* %out ret void @@ -61,7 +61,7 @@ define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 { ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.y() store i32 %id, i32 addrspace(1)* %out ret void @@ -96,7 +96,7 @@ define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 { ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 -define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workgroup.id.z() store i32 %id, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll index 1f18173f40a..8b80998cab6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll @@ -18,7 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0 ; ALL-NOT: v0 ; ALL: {{buffer|flat}}_store_dword {{.*}}v0 -define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() store i32 %id, i32 addrspace(1)* %out ret void @@ -33,7 +33,7 @@ define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 { ; ALL-NOT: v1 ; ALL: {{buffer|flat}}_store_dword {{.*}}v1 -define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.y() store i32 %id, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 { ; ALL-NOT: v2 ; ALL: {{buffer|flat}}_store_dword {{.*}}v2 -define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 { %id = call i32 @llvm.amdgcn.workitem.id.z() store i32 %id, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll index 6d5147b67a5..ad6534b143b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.ceil.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a) ; VI: v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @ceil_f16( +define amdgpu_kernel void @ceil_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @ceil_v2f16( +define 
amdgpu_kernel void @ceil_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 58072631212..60c07215ae5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -13,7 +13,7 @@ declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @cos_f16( +define amdgpu_kernel void @cos_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -53,7 +53,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @cos_v2f16( +define amdgpu_kernel void @cos_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.ll index eb7dcbbf234..bd89502d7b8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.ll @@ -11,7 +11,7 @@ ;SI: v_cos_f32 ;SI-NOT: v_cos_f32 -define void @test(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %x) #1 { %cos = call float @llvm.cos.f32(float %x) store float %cos, float addrspace(1)* %out ret void @@ -29,7 +29,7 @@ define void @test(float addrspace(1)* %out, float %x) #1 { ;SI: v_cos_f32 ;SI-NOT: v_cos_f32 -define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { +define amdgpu_kernel void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 { %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx) store <4 x float> %cos, <4 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 8b0854c2c2b..c4a76de5989 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -9,7 +9,7 @@ ; CHECK: buffer_store_dword ; CHECK: s_endpgm -define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { +define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 { entry: tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14 store i32 123, i32 addrspace(1)* %globalptr_arg, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll index 5b87e89a0a4..586c06420e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a) ; VI: v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @exp2_f16( +define amdgpu_kernel void @exp2_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @exp2_v2f16( +define amdgpu_kernel void @exp2_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll index e10b101dc3f..387dc3b8566 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll @@ -11,7 +11,7 @@ ;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}} 
;SI: v_exp_f32 -define void @test(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.exp2.f32(float %in) store float %0, float addrspace(1)* %out @@ -34,7 +34,7 @@ entry: ;SI: v_exp_f32 ;SI: v_exp_f32 -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -68,7 +68,7 @@ entry: ;SI: v_exp_f32 ;SI: v_exp_f32 ;SI: v_exp_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll index 3a5f86d90f6..a62cab932dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.floor.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a) ; VI: v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @floor_f16( +define amdgpu_kernel void @floor_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @floor_v2f16( +define amdgpu_kernel void @floor_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll index 9a9b87d3877..a677eb81135 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fma.f16.ll @@ -16,7 +16,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16( +define amdgpu_kernel void @fma_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -42,7 +42,7 @@ define void @fma_f16( ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_a( +define amdgpu_kernel void @fma_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -65,7 +65,7 @@ define void @fma_f16_imm_a( ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_b( +define amdgpu_kernel void @fma_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -88,7 +88,7 @@ define void @fma_f16_imm_b( ; VI: v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @fma_f16_imm_c( +define amdgpu_kernel void @fma_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -130,7 +130,7 @@ define void @fma_f16_imm_c( ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16( +define amdgpu_kernel void @fma_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* 
%b, @@ -165,7 +165,7 @@ define void @fma_v2f16( ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_a( +define amdgpu_kernel void @fma_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c) { @@ -203,7 +203,7 @@ define void @fma_v2f16_imm_a( ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_b( +define amdgpu_kernel void @fma_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %c) { @@ -241,7 +241,7 @@ define void @fma_v2f16_imm_b( ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fma_v2f16_imm_c( +define amdgpu_kernel void @fma_v2f16_imm_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index d1d618bea69..9d01571fc2d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -24,7 +24,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm -define void @fmuladd_f16( +define amdgpu_kernel void @fmuladd_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -54,7 +54,7 @@ define void @fmuladd_f16( ; VI-DENORM: buffer_store_short [[RESULT]] ; GCN: s_endpgm -define void @fmuladd_f16_imm_a( +define amdgpu_kernel void @fmuladd_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c) { @@ -83,7 +83,7 @@ define void @fmuladd_f16_imm_a( ; GCN: s_endpgm -define void @fmuladd_f16_imm_b( +define amdgpu_kernel void @fmuladd_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c) { @@ -135,7 +135,7 @@ define void @fmuladd_f16_imm_b( ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @fmuladd_v2f16( +define amdgpu_kernel void @fmuladd_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll index a534ee032cb..a12ebcb7e38 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.log2.v2f16(<2 x half> %a) ; VI: v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @log2_f16( +define amdgpu_kernel void @log2_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @log2_v2f16( +define amdgpu_kernel void @log2_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll index c75e7850b35..b9d593e43f3 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll @@ -11,7 +11,7 @@ ;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} ;SI: v_log_f32 -define void @test(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void 
@test(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.log2.f32(float %in) store float %0, float addrspace(1)* %out @@ -34,7 +34,7 @@ entry: ;SI: v_log_f32 ;SI: v_log_f32 -define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -68,7 +68,7 @@ entry: ;SI: v_log_f32 ;SI: v_log_f32 ;SI: v_log_f32 -define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in) store <4 x float> %0, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 86ee8f11ace..4c93c776a88 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -14,7 +14,7 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16( +define amdgpu_kernel void @maxnum_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -34,7 +34,7 @@ entry: ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16_imm_a( +define amdgpu_kernel void @maxnum_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -52,7 +52,7 @@ entry: ; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @maxnum_f16_imm_b( +define amdgpu_kernel void @maxnum_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -87,7 +87,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16( +define amdgpu_kernel void @maxnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -115,7 +115,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16_imm_a( +define amdgpu_kernel void @maxnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -141,7 +141,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @maxnum_v2f16_imm_b( +define amdgpu_kernel void @maxnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll index 009338d273f..7b4db55155e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.memcpy.ll @@ -80,7 +80,7 @@ declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace ; SI-DAG: ds_write_b8 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 
addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind @@ -125,7 +125,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias % ; SI-DAG: ds_write_b16 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind @@ -144,7 +144,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias % ; SI: ds_write2_b32 ; SI: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind @@ -161,7 +161,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias % ; SI: ds_write2_b64 ; SI-DAG: s_endpgm -define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)* %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)* call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind @@ -238,7 +238,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias % ; SI-DAG: buffer_store_byte ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind @@ -281,7 +281,7 @@ define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noa ; SI-DAG: buffer_store_short ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind @@ -294,7 +294,7 @@ define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noa ; SI: buffer_store_dwordx4 
; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind @@ -307,7 +307,7 @@ define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noa ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind @@ -320,7 +320,7 @@ define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noa ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)* %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)* call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind @@ -340,7 +340,7 @@ define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* no ; SI-DAG: s_load_dwordx2 ; SI-DAG: buffer_store_dwordx4 ; SI-DAG: buffer_store_dwordx4 -define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { +define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind { %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)* call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false) ret void @@ -365,7 +365,7 @@ define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) noun ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI: buffer_store_byte -define void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { +define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind { %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)* call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 1, i1 false) ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index a69d99c8c95..d058256fa73 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -14,7 +14,7 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16( +define 
amdgpu_kernel void @minnum_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -34,7 +34,7 @@ entry: ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16_imm_a( +define amdgpu_kernel void @minnum_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b) { entry: @@ -52,7 +52,7 @@ entry: ; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @minnum_f16_imm_b( +define amdgpu_kernel void @minnum_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -86,7 +86,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16( +define amdgpu_kernel void @minnum_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b) { @@ -117,7 +117,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16_imm_a( +define amdgpu_kernel void @minnum_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b) { entry: @@ -143,7 +143,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @minnum_v2f16_imm_b( +define amdgpu_kernel void @minnum_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll index 4db29c58385..de8a47741c9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.dot4.ll @@ -2,7 +2,7 @@ declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone -define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { +define amdgpu_kernel void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind { %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16 %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16 %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll index e4e6dd8e106..93caafbb952 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll @@ -2,7 +2,7 @@ ; EG-LABEL: {{^}}test_group_barrier: ; EG: GROUP_BARRIER -define void @test_group_barrier(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_group_barrier(i32 addrspace(1)* %out) #0 { entry: %tmp = call i32 @llvm.r600.read.tidig.x() %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll index a5b07e072fa..82c42601ef1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll @@ -14,7 +14,7 @@ ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 
@@ entry: ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -44,7 +44,7 @@ entry: ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @local_size_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.local.size.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -59,7 +59,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xy(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -78,7 +78,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -98,7 +98,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_yz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) { entry: %y = call i32 @llvm.r600.read.local.size.y() #0 %z = call i32 @llvm.r600.read.local.size.z() #0 @@ -121,7 +121,7 @@ entry: ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]] ; GCN: buffer_store_dword [[VAL]] -define void @local_size_xyz(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) { entry: %x = call i32 @llvm.r600.read.local.size.x() #0 %y = call i32 @llvm.r600.read.local.size.y() #0 @@ -138,7 +138,7 @@ entry: ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_x_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.x() #0 %shl = shl i32 %size, 16 @@ -153,7 +153,7 @@ entry: ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_y_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.y() #0 %shl = shl i32 %size, 16 @@ -168,7 +168,7 @@ entry: ; GCN-NOT: 0xffff ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NEXT: buffer_store_dword [[VVAL]] -define void @local_size_z_known_bits(i32 addrspace(1)* %out) { +define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) { entry: %size = call i32 @llvm.r600.read.local.size.z() #0 %shl = shl i32 %size, 16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll index 1c6e7950e9b..90d076d4fb4 100644 --- 
a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll @@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone ; EG-LABEL: {{^}}rsq_clamped_f32: ; EG: RECIPSQRT_CLAMPED -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { +define amdgpu_kernel void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src) store float %rsq_clamped, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll index 1d6bff01e66..d9177b39b8a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll @@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone ; EG-LABEL: {{^}}recipsqrt.ieee_f32: ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { ; TODO: Really these should be constant folded ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0 ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0 ; EG: RECIPSQRT_IEEE -define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll index 409037f3e97..98044917e2b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.r600.tex.ll @@ -17,7 +17,7 @@ ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN -define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { bb: %addr = load <4 x float>, <4 x float> addrspace(1)* %in %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3> diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index c8c6e2837ca..8f6007c7d4b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) ; VI: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm 
-define void @rint_f16( +define amdgpu_kernel void @rint_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @rint_v2f16( +define amdgpu_kernel void @rint_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll index c63fb172794..30ce8ed83ff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f64.ll @@ -11,7 +11,7 @@ ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 ; SI: s_endpgm -define void @rint_f64(double addrspace(1)* %out, double %in) { +define amdgpu_kernel void @rint_f64(double addrspace(1)* %out, double %in) { entry: %0 = call double @llvm.rint.f64(double %in) store double %0, double addrspace(1)* %out @@ -21,7 +21,7 @@ entry: ; FUNC-LABEL: {{^}}rint_v2f64: ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 -define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { +define amdgpu_kernel void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) { entry: %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in) store <2 x double> %0, <2 x double> addrspace(1)* %out @@ -33,7 +33,7 @@ entry: ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 ; CI: v_rndne_f64_e32 -define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { +define amdgpu_kernel void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) { entry: %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in) store <4 x double> %0, <4 x double> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll index cf7c0e4c6fb..4056bc39448 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.ll @@ -6,7 +6,7 @@ ; R600: RNDNE ; SI: v_rndne_f32_e32 -define void @rint_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @rint_f32(float addrspace(1)* %out, float %in) { entry: %0 = call float @llvm.rint.f32(float %in) #0 store float %0, float addrspace(1)* %out @@ -19,7 +19,7 @@ entry: ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 -define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { +define amdgpu_kernel void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { entry: %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0 store <2 x float> %0, <2 x float> addrspace(1)* %out @@ -36,7 +36,7 @@ entry: ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 ; SI: v_rndne_f32_e32 -define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { +define amdgpu_kernel void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { entry: %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0 store <4 x float> %0, <4 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll index 3ea4551f0ee..c58b9b4d9e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -2,7 +2,7 @@ ; FUNC-LABEL: {{^}}round_f64: ; SI: s_endpgm -define void @round_f64(double addrspace(1)* %out, double %x) #0 { +define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 { %result = call double @llvm.round.f64(double %x) #1 store double %result, double addrspace(1)* %out ret void @@ -26,7 +26,7 @@ define void @round_f64(double 
addrspace(1)* %out, double %x) #0 { ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid @@ -38,7 +38,7 @@ define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 ; FUNC-LABEL: {{^}}round_v2f64: ; SI: s_endpgm -define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { +define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 store <2 x double> %result, <2 x double> addrspace(1)* %out ret void @@ -46,7 +46,7 @@ define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { ; FUNC-LABEL: {{^}}round_v4f64: ; SI: s_endpgm -define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { +define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 store <4 x double> %result, <4 x double> addrspace(1)* %out ret void @@ -54,7 +54,7 @@ define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { ; FUNC-LABEL: {{^}}round_v8f64: ; SI: s_endpgm -define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { +define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 store <8 x double> %result, <8 x double> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.ll index 7e8f8ff172e..3cf86c38b6b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.round.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.ll @@ -20,7 +20,7 @@ ; R600-DAG: SETGE ; R600-DAG: CNDE ; R600-DAG: ADD -define void @round_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @round_f32(float addrspace(1)* %out, float %x) #0 { %result = call float @llvm.round.f32(float %x) #1 store float %result, float addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @round_f32(float addrspace(1)* %out, float %x) #0 { ; FUNC-LABEL: {{^}}round_v2f32: ; SI: s_endpgm ; R600: CF_END -define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { +define amdgpu_kernel void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { ; FUNC-LABEL: {{^}}round_v4f32: ; SI: s_endpgm ; R600: CF_END -define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { +define amdgpu_kernel void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 store <4 x float> %result, <4 x float> addrspace(1)* %out ret void @@ -52,7 +52,7 @@ define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { ; FUNC-LABEL: {{^}}round_v8f32: ; SI: s_endpgm ; R600: CF_END -define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { +define amdgpu_kernel void 
@round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 store <8 x float> %result, <8 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index d17df13d75d..e89e545dbbd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -13,7 +13,7 @@ declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sin_f16( +define amdgpu_kernel void @sin_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -52,7 +52,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @sin_v2f16( +define amdgpu_kernel void @sin_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll index 04754396a0f..2a17303267b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.ll @@ -12,7 +12,7 @@ ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @sin_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @sin_f32(float addrspace(1)* %out, float %x) #1 { %sin = call float @llvm.sin.f32(float %x) store float %sin, float addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @sin_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 3.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -38,7 +38,7 @@ define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 3.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -51,7 +51,7 @@ define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -65,7 +65,7 @@ define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -78,7 +78,7 @@ define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { +define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float 
addrspace(1)* %out @@ -91,7 +91,7 @@ define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 { ; SI: v_fract_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { +define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { %y = fmul float 2.0, %x %sin = call float @llvm.sin.f32(float %y) store float %sin, float addrspace(1)* %out @@ -110,7 +110,7 @@ define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 { ; SI: v_sin_f32 ; SI: v_sin_f32 ; SI-NOT: v_sin_f32 -define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { +define amdgpu_kernel void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 { %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx) store <4 x float> %sin, <4 x float> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index 74373ff0167..64bd5db5cdc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) ; VI: v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sqrt_f16( +define amdgpu_kernel void @sqrt_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @sqrt_v2f16( +define amdgpu_kernel void @sqrt_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index ee6eb5074a1..bcfe868ad57 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -12,7 +12,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) ; VI: v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @trunc_f16( +define amdgpu_kernel void @trunc_f16( half addrspace(1)* %r, half addrspace(1)* %a) { entry: @@ -37,7 +37,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]] ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @trunc_v2f16( +define amdgpu_kernel void @trunc_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll index 1b42a9e96e0..0050d1a4f87 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll @@ -6,7 +6,7 @@ ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}] ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_store_dwordx2 -define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 { %ld = load double, double addrspace(2)* %in store double %ld, double addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll index 104af10036c..c8abe5c77ee 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll @@ -9,56 +9,56 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { 
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %load = load i1, i1 addrspace(2)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v2i1: -define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v3i1: -define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v4i1: -define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v8i1: -define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v16i1: -define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v32i1: -define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in store <32 x i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}constant_load_v64i1: -define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in store <64 x i1> %load, <64 x i1> addrspace(1)* %out ret void @@ -67,7 +67,7 @@ define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspa ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -81,7 +81,7 @@ define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2) ; EG: VTX_READ_8 ; EG: BFE_INT -define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, 
i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -89,7 +89,7 @@ define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2) } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32: -define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -97,7 +97,7 @@ define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32: -define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -105,7 +105,7 @@ define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32: -define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -113,7 +113,7 @@ define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32: -define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -121,7 +121,7 @@ define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32: -define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32: -define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ 
define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32: -define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -145,7 +145,7 @@ define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32: -define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -153,7 +153,7 @@ define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32: -define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -161,7 +161,7 @@ define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32: -define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -169,7 +169,7 @@ define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32: -define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -177,7 +177,7 @@ define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32: -define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -185,7 +185,7 @@ define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32: -define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* 
nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -193,7 +193,7 @@ define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32: -define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -201,7 +201,7 @@ define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32: -define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -209,7 +209,7 @@ define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6 } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32: -define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -221,7 +221,7 @@ define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]] ; GCN: buffer_store_dwordx2 -define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -233,7 +233,7 @@ define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2) ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 { %a = load i1, i1 addrspace(2)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -241,7 +241,7 @@ define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2) } ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64: -define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in 
%ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64: -define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(2)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -257,7 +257,7 @@ define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x } ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64: -define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -265,7 +265,7 @@ define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64: -define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(2)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -273,7 +273,7 @@ define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x } ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64: -define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -281,7 +281,7 @@ define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64: -define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(2)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -289,7 +289,7 @@ define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x } ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64: -define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -297,7 +297,7 @@ define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64: -define void 
@constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(2)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -305,7 +305,7 @@ define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x } ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64: -define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -313,7 +313,7 @@ define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64: -define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(2)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -321,7 +321,7 @@ define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x } ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64: -define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -329,7 +329,7 @@ define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64: -define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(2)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -337,7 +337,7 @@ define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1 } ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64: -define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -345,7 +345,7 @@ define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64: -define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> 
addrspace(2)* nocapture %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(2)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -353,7 +353,7 @@ define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3 } ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64: -define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -361,7 +361,7 @@ define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <6 } ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64: -define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(2)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index f7be1291040..5dd2efdf638 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -8,7 +8,7 @@ ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) { entry: %ld = load i16, i16 addrspace(2)* %in store i16 %ld, i16 addrspace(1)* %out @@ -19,7 +19,7 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out @@ -31,7 +31,7 @@ entry: ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out @@ -42,7 +42,7 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out @@ -53,7 +53,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(2)* 
%in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out @@ -65,7 +65,7 @@ entry: ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out @@ -80,7 +80,7 @@ entry: ; GCN-HSA: flat_store_dword ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -97,7 +97,7 @@ define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace( ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -109,7 +109,7 @@ define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace( ; GCN-HSA: flat_load_ushort ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1 -define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -123,7 +123,7 @@ define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 16 -define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -140,7 +140,7 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EG: 16 ; EG: 16 -define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -160,7 +160,7 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void 
@constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -183,7 +183,7 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -204,7 +204,7 @@ entry: ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { +define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -229,7 +229,7 @@ entry: ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -254,7 +254,7 @@ define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -288,7 +288,7 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 65535 ; EG-DAG: 65535 ; EG-DAG: 65535 -define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -322,7 +322,7 @@ define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; EG-DAG: 16 ; EG-DAG: 16 ; EG-DAG: 16 -define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -337,7 +337,7 @@ define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 
[[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1 -define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -352,7 +352,7 @@ define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < ; v16i16 is naturally 32 byte aligned ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1 -define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -369,7 +369,7 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -385,7 +385,7 @@ define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1 -define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -404,7 +404,7 @@ define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1 -define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -421,7 +421,7 @@ define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 
112, #1 -define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(2)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -438,7 +438,7 @@ define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, < ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -464,7 +464,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 { %a = load i16, i16 addrspace(2)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -475,7 +475,7 @@ define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace( ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -488,7 +488,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EG: 31 -define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(2)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -498,7 +498,7 @@ define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -508,7 +508,7 @@ define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> 
addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(2)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -518,7 +518,7 @@ define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -528,7 +528,7 @@ define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(2)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -538,7 +538,7 @@ define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -548,7 +548,7 @@ define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(2)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -559,7 +559,7 @@ define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -570,7 +570,7 @@ define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void 
@constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(2)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -583,7 +583,7 @@ define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -596,7 +596,7 @@ define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, < ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(2)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -606,7 +606,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, < ; These trigger undefined register machine verifier errors ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64: -; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -614,7 +614,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, < ; } ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64: -; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll index d1ff1c706c4..7370d45ca6b 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll @@ -7,7 +7,7 @@ ; GCN: s_load_dword s{{[0-9]+}} ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { entry: %ld = load i32, i32 addrspace(2)* %in store i32 %ld, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 -define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x 
i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -40,7 +40,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -52,7 +52,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out @@ -66,7 +66,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -81,7 +81,7 @@ entry: ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG: CF_END ; EG: VTX_READ_32 -define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -98,7 +98,7 @@ define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace( ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 { %ld = load i32, i32 addrspace(2)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -108,7 +108,7 @@ define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace( ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64: ; GCN: s_load_dword ; GCN: store_dwordx2 -define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -119,7 +119,7 @@ define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; GCN: s_load_dword s[[LO:[0-9]+]] ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31 ; GCN: store_dwordx2 -define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64: ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; GCN: store_dwordx4 -define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -143,7 +143,7 @@ define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; GCN-DAG: s_ashr_i32 ; GCN: store_dwordx4 -define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -155,7 +155,7 @@ define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -172,7 +172,7 @@ define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 { %ld = load <4 x i32>, <4 x i32> 
addrspace(2)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -191,7 +191,7 @@ define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -219,7 +219,7 @@ define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -240,7 +240,7 @@ define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; GCN: store_dwordx4 ; GCN: store_dwordx4 ; GCN: store_dwordx4 -define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -267,7 +267,7 @@ define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, < ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -319,7 +319,7 @@ define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, < ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -370,7 +370,7 @@ define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, < ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out 
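Each hunk above swaps the plain define line for one carrying the amdgpu_kernel calling convention and leaves the body, the function name, and the FileCheck labels untouched. As an illustrative sketch (not part of the diff), here is the full before/after for one of the functions from load-constant-i32.ll above; the trailing ret void is implied by the surrounding hunks rather than shown in them.

; Before:
define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
  %ld = load i32, i32 addrspace(2)* %in
  %ext = zext i32 %ld to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}

; After: only the calling convention on the define line differs.
define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
  %ld = load i32, i32 addrspace(2)* %in
  %ext = zext i32 %ld to i64
  store i64 %ext, i64 addrspace(1)* %out
  ret void
}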
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll index 0d071a10b49..14e50ea4c3c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}constant_load_i64: ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}} ; EG: VTX_READ_64 -define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 { %ld = load i64, i64 addrspace(2)* %in store i64 %ld, i64 addrspace(1)* %out ret void @@ -17,7 +17,7 @@ define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 -define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; EG-DAG: VTX_READ_128 ; EG-DAG: VTX_READ_128 -define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out @@ -41,7 +41,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out @@ -55,7 +55,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out @@ -74,7 +74,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll index 9fdc4ebfd85..6e56b9f9b6d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -10,7 +10,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { entry: %ld = load i8, i8 addrspace(2)* %in store i8 %ld, i8 addrspace(1)* %out @@ -22,7 +22,7 @@ entry: ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v2i8(<2 x i8> 
addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out @@ -33,7 +33,7 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out @@ -44,7 +44,7 @@ entry: ; GCN: s_load_dword s ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out @@ -55,7 +55,7 @@ entry: ; GCN: s_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out @@ -66,7 +66,7 @@ entry: ; GCN: s_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out @@ -78,7 +78,7 @@ entry: ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -92,7 +92,7 @@ define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2) ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %ld = load i8, i8 addrspace(2)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out @@ -102,7 +102,7 @@ define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2) ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -114,7 +114,7 @@ define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define 
void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x ; TODO: This should use DST, but for some there are redundant MOVs ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG: 8 -define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -150,7 +150,7 @@ define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -170,7 +170,7 @@ define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -193,7 +193,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -214,7 +214,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -236,7 +236,7 @@ define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -264,7 +264,7 @@ define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void 
@constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -294,7 +294,7 @@ define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -335,7 +335,7 @@ define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -378,7 +378,7 @@ define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -450,7 +450,7 @@ define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -526,7 +526,7 @@ define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -539,7 +539,7 @@ define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(2)* %in %ext 
= zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -552,7 +552,7 @@ define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(2)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -570,7 +570,7 @@ define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -589,7 +589,7 @@ define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2) ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -600,7 +600,7 @@ define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2) ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -613,7 +613,7 @@ define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? 
; EG: 31 -define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -623,7 +623,7 @@ define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -633,7 +633,7 @@ define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -643,7 +643,7 @@ define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -653,7 +653,7 @@ define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -663,7 +663,7 @@ define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -673,7 +673,7 @@ define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* 
%out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -683,7 +683,7 @@ define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -693,7 +693,7 @@ define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1 ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -704,7 +704,7 @@ define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -715,7 +715,7 @@ define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -723,7 +723,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64: -; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -731,7 +731,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64: -; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) 
#0 { +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -744,7 +744,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3 ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]], ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] -define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -759,7 +759,7 @@ define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2) ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 { %a = load i8, i8 addrspace(2)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -767,7 +767,7 @@ define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2) } ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16: -define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -778,7 +778,7 @@ define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(2)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -788,7 +788,7 @@ define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(2)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -800,7 +800,7 @@ define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 { %load = load <2 x i8>, <2 x 
i8> addrspace(2)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -810,7 +810,7 @@ define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -824,7 +824,7 @@ define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(2)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -834,7 +834,7 @@ define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -853,7 +853,7 @@ define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(2)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -863,7 +863,7 @@ define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -889,7 +889,7 @@ define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> 
addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(2)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -900,7 +900,7 @@ define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -943,7 +943,7 @@ define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { +define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(2)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -951,7 +951,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3 } ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16: -; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out @@ -959,7 +959,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3 ; } ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16: -; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { +; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(2)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll index 805c0a7a39c..bd6fea587b4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll @@ -10,7 +10,7 @@ ; GCN-HSA: flat_load_dword ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %tmp0 = load float, float addrspace(1)* %in store float %tmp0, float addrspace(1)* %out @@ -22,7 +22,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; R600: VTX_READ_64 -define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f32(<2 
x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -34,7 +34,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in store <3 x float> %tmp0, <3 x float> addrspace(1)* %out @@ -46,7 +46,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; R600: VTX_READ_128 -define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in store <4 x float> %tmp0, <4 x float> addrspace(1)* %out @@ -61,7 +61,7 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in store <8 x float> %tmp0, <8 x float> addrspace(1)* %out @@ -83,7 +83,7 @@ entry: ; R600: VTX_READ_128 ; R600: VTX_READ_128 ; R600: VTX_READ_128 -define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in store <16 x float> %tmp0, <16 x float> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll index dc1a9432283..5b772e1fe5e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll @@ -8,7 +8,7 @@ ; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]] ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] -define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { %ld = load double, double addrspace(1)* %in store double %ld, double addrspace(1)* %out ret void @@ -17,7 +17,7 @@ define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in ; FUNC-LABEL: {{^}}global_load_v2f64: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { entry: %ld = load <2 x double>, <2 x double> addrspace(1)* %in store <2 x double> %ld, <2 x double> addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(1)* %in store <3 x double> %ld, <3 x double> addrspace(1)* %out @@ -42,7 +42,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: 
flat_load_dwordx4 -define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { entry: %ld = load <4 x double>, <4 x double> addrspace(1)* %in store <4 x double> %ld, <4 x double> addrspace(1)* %out @@ -59,7 +59,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { entry: %ld = load <8 x double>, <8 x double> addrspace(1)* %in store <8 x double> %ld, <8 x double> addrspace(1)* %out @@ -84,7 +84,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { entry: %ld = load <16 x double>, <16 x double> addrspace(1)* %in store <16 x double> %ld, <16 x double> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll index e2e90cac8cc..cb3536a0c12 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll @@ -9,56 +9,56 @@ ; EG: VTX_READ_8 ; EG: AND_INT -define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %load = load i1, i1 addrspace(1)* %in store i1 %load, i1 addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v2i1: -define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in store <2 x i1> %load, <2 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v3i1: -define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in store <3 x i1> %load, <3 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v4i1: -define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in store <4 x i1> %load, <4 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v8i1: -define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in store <8 x i1> %load, <8 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v16i1: -define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in store <16 x i1> %load, <16 x i1> addrspace(1)* %out ret 
void } ; FUNC-LABEL: {{^}}global_load_v32i1: -define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in store <32 x i1> %load, <32 x i1> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}global_load_v64i1: -define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in store <64 x i1> %load, <64 x i1> addrspace(1)* %out ret void @@ -67,7 +67,7 @@ define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace ; FUNC-LABEL: {{^}}global_zextload_i1_to_i32: ; GCN: buffer_load_ubyte ; GCN: buffer_store_dword -define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -81,7 +81,7 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* ; EG: VTX_READ_8 ; EG: BFE_INT -define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -89,7 +89,7 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32: -define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -97,7 +97,7 @@ define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32: -define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -105,7 +105,7 @@ define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32: -define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -113,7 +113,7 @@ define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32: -define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x 
i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -121,7 +121,7 @@ define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32: -define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -129,7 +129,7 @@ define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32: -define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(1)* %out @@ -137,7 +137,7 @@ define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32: -define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -145,7 +145,7 @@ define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32: -define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -153,7 +153,7 @@ define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32: -define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -161,7 +161,7 @@ define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32: -define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -169,7 +169,7 @@ define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32: -define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -177,7 +177,7 @@ define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32: -define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -185,7 +185,7 @@ define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32: -define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -193,7 +193,7 @@ define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32: -define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -201,7 +201,7 @@ define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32: -define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -209,7 +209,7 @@ define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 } ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32: -define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -221,7 +221,7 @@ define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}} ; GCN: buffer_store_dwordx2 -define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -233,7 +233,7 @@ define void 
@global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: buffer_store_dwordx2 -define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %a = load i1, i1 addrspace(1)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -241,7 +241,7 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* } ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64: -define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -249,7 +249,7 @@ define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64: -define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(1)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -257,7 +257,7 @@ define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64: -define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -265,7 +265,7 @@ define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64: -define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(1)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -273,7 +273,7 @@ define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64: -define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -281,7 +281,7 @@ define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64: -define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(1)* %in %ext = sext <3 x 
i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(1)* %out @@ -289,7 +289,7 @@ define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64: -define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -297,7 +297,7 @@ define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64: -define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(1)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -305,7 +305,7 @@ define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64: -define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -313,7 +313,7 @@ define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64: -define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(1)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -321,7 +321,7 @@ define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1 } ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64: -define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -329,7 +329,7 @@ define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64: -define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(1)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -337,7 +337,7 @@ define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 } ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64: -define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -345,7 +345,7 @@ define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64: -define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(1)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -353,7 +353,7 @@ define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64: -define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -361,7 +361,7 @@ define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 } ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64: -define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(1)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll index 88d6b7b99d3..dcdd1a947cd 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -11,7 +11,7 @@ ; GCN-HSA: flat_load_ushort ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { entry: %ld = load i16, i16 addrspace(1)* %in store i16 %ld, i16 addrspace(1)* %out @@ -23,7 +23,7 @@ entry: ; GCN-HSA: flat_load_dword v ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in store <2 x i16> %ld, <2 x i16> addrspace(1)* %out @@ -36,7 +36,7 @@ entry: ; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1 -define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in store <3 x i16> %ld, <3 x i16> addrspace(1)* %out @@ -48,7 +48,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void 
@global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in store <4 x i16> %ld, <4 x i16> addrspace(1)* %out @@ -60,7 +60,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in store <8 x i16> %ld, <8 x i16> addrspace(1)* %out @@ -76,7 +76,7 @@ entry: ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in store <16 x i16> %ld, <16 x i16> addrspace(1)* %out @@ -91,7 +91,7 @@ entry: ; GCN-HSA: flat_store_dword ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -108,7 +108,7 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EGCM: 16 -define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -120,7 +120,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; GCN-HSA: flat_load_ushort ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -134,7 +134,7 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EGCM: 16 -define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -148,7 +148,7 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i ; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal ; EGCM: 16 -define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 
{ +define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -168,7 +168,7 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -190,7 +190,7 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; EGCM: 16 ; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal ; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal -define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -214,7 +214,7 @@ entry: ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -237,7 +237,7 @@ entry: ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal ; EGCM-DAG: 16 -define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -262,7 +262,7 @@ define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -296,7 +296,7 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; EGCM-DAG: 16 -define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -330,7 +330,7 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; EGCM-DAG: 16 ; EGCM-DAG: 16 ; 
EGCM-DAG: 16 -define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -346,7 +346,7 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -357,7 +357,7 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 -define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -379,7 +379,7 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -401,7 +401,7 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1 -define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -435,7 +435,7 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 -define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 
x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -452,7 +452,7 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1 -define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(1)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -469,7 +469,7 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM: MOV {{.*}}, 0.0 -define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -495,7 +495,7 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EGCM: 31 -define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %a = load i16, i16 addrspace(1)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -506,7 +506,7 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EGCM: MOV {{.*}}, 0.0 -define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -519,7 +519,7 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: These could be expanded earlier using ASHR 15 ; EGCM: 31 -define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(1)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -527,7 +527,7 @@ define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i } ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64: -define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = zext <2 x i16> %load to <2 x 
i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -537,7 +537,7 @@ define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64: ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -547,7 +547,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64: ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -557,7 +557,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64: ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -567,7 +567,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64: ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -577,7 +577,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64: ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(1)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -588,7 +588,7 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = zext <16 x i16> %load to <16 x i64> 
store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -599,7 +599,7 @@ define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(1)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -612,7 +612,7 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -625,7 +625,7 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1 -define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(1)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -633,7 +633,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; ; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64: -; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -641,7 +641,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; } ; ; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64: -; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(1)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll index e3335347a63..5df32c1e312 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll @@ -9,7 +9,7 @@ ; GCN-HSA: flat_load_dword ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0 -define void @global_load_i32(i32 addrspace(1)* %out, 
i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %ld = load i32, i32 addrspace(1)* %in store i32 %ld, i32 addrspace(1)* %out @@ -21,7 +21,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_64 -define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in store <2 x i32> %ld, <2 x i32> addrspace(1)* %out @@ -33,7 +33,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in store <3 x i32> %ld, <3 x i32> addrspace(1)* %out @@ -45,7 +45,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %ld, <4 x i32> addrspace(1)* %out @@ -60,7 +60,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in store <8 x i32> %ld, <8 x i32> addrspace(1)* %out @@ -82,7 +82,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in store <16 x i32> %ld, <16 x i32> addrspace(1)* %out @@ -98,7 +98,7 @@ entry: ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]] ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY -define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -117,7 +117,7 @@ define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1) ; EG: VTX_READ_32 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, literal. 
; EG: 31 -define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %ld = load i32, i32 addrspace(1)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(1)* %out @@ -130,7 +130,7 @@ define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1) ; GCN-HSA: flat_load_dword ; GCN-HSA: flat_store_dwordx2 -define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -143,7 +143,7 @@ define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -156,7 +156,7 @@ define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i ; GCN-HSA: flat_load_dwordx2 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -172,7 +172,7 @@ define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -187,7 +187,7 @@ define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -208,7 +208,7 @@ define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %ld 
= load <4 x i32>, <4 x i32> addrspace(1)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -231,7 +231,7 @@ define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-SA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -263,7 +263,7 @@ define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -309,7 +309,7 @@ define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i ; GCN-DAG: v_ashrrev_i32 ; GCN-NOHSA-DAG: buffer_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -344,7 +344,7 @@ define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -444,7 +444,7 @@ define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; GCN-HSA: flat_store_dwordx4 ; GCN-HSA: flat_store_dwordx4 -define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -511,7 +511,7 @@ define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 ; GCN-HSA-DAG: flat_store_dwordx4 -define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, 
<32 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll index dd4ce2c10eb..de16b6c8997 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll @@ -13,7 +13,7 @@ ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]] ; EG: VTX_READ_64 -define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %ld = load i64, i64 addrspace(1)* %in store i64 %ld, i64 addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 -define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in store <2 x i64> %ld, <2 x i64> addrspace(1)* %out @@ -40,7 +40,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in store <3 x i64> %ld, <3 x i64> addrspace(1)* %out @@ -56,7 +56,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in store <4 x i64> %ld, <4 x i64> addrspace(1)* %out @@ -78,7 +78,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in store <8 x i64> %ld, <8 x i64> addrspace(1)* %out @@ -112,7 +112,7 @@ entry: ; EG: VTX_READ_128 ; EG: VTX_READ_128 ; EG: VTX_READ_128 -define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in store <16 x i64> %ld, <16 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll index c880700f347..71adf090532 100644 --- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll @@ -11,7 +11,7 @@ ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; TODO: NOT AND -define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { entry: %ld = load i8, i8 addrspace(1)* %in store i8 %ld, i8 addrspace(1)* %out @@ -23,7 +23,7 @@ entry: ; GCN-HSA: flat_load_ushort v ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { entry: %ld = load <2 x 
i8>, <2 x i8> addrspace(1)* %in store <2 x i8> %ld, <2 x i8> addrspace(1)* %out @@ -35,7 +35,7 @@ entry: ; GCN-HSA: flat_load_dword v ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in store <3 x i8> %ld, <3 x i8> addrspace(1)* %out @@ -47,7 +47,7 @@ entry: ; GCN-HSA: flat_load_dword v ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in store <4 x i8> %ld, <4 x i8> addrspace(1)* %out @@ -59,7 +59,7 @@ entry: ; GCN-HSA: flat_load_dwordx2 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in store <8 x i8> %ld, <8 x i8> addrspace(1)* %out @@ -72,7 +72,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in store <16 x i8> %ld, <16 x i8> addrspace(1)* %out @@ -84,7 +84,7 @@ entry: ; GCN-HSA: flat_load_ubyte ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(1)* %out @@ -98,7 +98,7 @@ define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %ld = load i8, i8 addrspace(1)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(1)* %out @@ -108,7 +108,7 @@ define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -120,7 +120,7 @@ define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal ; EG: 8 -define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> 
addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(1)* %out @@ -135,7 +135,7 @@ define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8 ; TODO: These should use DST, but for some there are redundant MOVs ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal ; EG-DAG: 8 -define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -152,7 +152,7 @@ define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(1)* %out @@ -174,7 +174,7 @@ define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -207,7 +207,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -227,7 +227,7 @@ entry: ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -248,7 +248,7 @@ define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(1)* %out @@ -273,7 +273,7 @@ define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define 
amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -300,7 +300,7 @@ define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(1)* %out @@ -341,7 +341,7 @@ define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -384,7 +384,7 @@ define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(1)* %out @@ -456,7 +456,7 @@ define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -532,7 +532,7 @@ define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EG-DAG: 8 ; EG-DAG: 8 ; EG-DAG: 8 -define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(1)* %out @@ -545,7 +545,7 @@ define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(1)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -558,7 +558,7 @@ define void 
@global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1 -define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(1)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(1)* %out @@ -576,7 +576,7 @@ define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -595,7 +595,7 @@ define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? ; EG: 31 -define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(1)* %out @@ -606,7 +606,7 @@ define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG: MOV {{.*}}, 0.0 -define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -619,7 +619,7 @@ define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal ; TODO: Why not 7 ? 
; EG: 31 -define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(1)* %out @@ -629,7 +629,7 @@ define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -639,7 +639,7 @@ define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8 ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(1)* %out @@ -649,7 +649,7 @@ define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -659,7 +659,7 @@ define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8 ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(1)* %out @@ -669,7 +669,7 @@ define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -679,7 +679,7 @@ define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8 ; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* 
%in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(1)* %out @@ -689,7 +689,7 @@ define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -699,7 +699,7 @@ define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(1)* %out @@ -710,7 +710,7 @@ define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -721,7 +721,7 @@ define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(1)* %out @@ -729,7 +729,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 } ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64: -; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -737,7 +737,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; } ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64: -; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void 
@global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out @@ -752,7 +752,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]] ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -768,7 +768,7 @@ define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %a = load i8, i8 addrspace(1)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(1)* %out @@ -778,7 +778,7 @@ define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16: ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -789,7 +789,7 @@ define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(1)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(1)* %out @@ -799,7 +799,7 @@ define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16: ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(1)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -811,7 +811,7 @@ define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 { %load = load <2 x i8>, <2 
x i8> addrspace(1)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(1)* %out @@ -821,7 +821,7 @@ define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16: ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -835,7 +835,7 @@ define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(1)* %out @@ -845,7 +845,7 @@ define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16: ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -863,7 +863,7 @@ define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(1)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(1)* %out @@ -873,7 +873,7 @@ define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16: ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 -define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -899,7 +899,7 @@ define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v16i8_to_v16i16(<16 x 
i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(1)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(1)* %out @@ -910,7 +910,7 @@ define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1 -define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -953,7 +953,7 @@ define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal -define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(1)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(1)* %out @@ -961,7 +961,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 } ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16: -; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out @@ -969,7 +969,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 ; } ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16: -; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { +; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(1)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll index 77b5e3cf3ae..09d7145424d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-f32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-f32.ll @@ -7,7 +7,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 { entry: %tmp0 = load float, float addrspace(3)* %in store float %tmp0, float addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> 
addrspace(3)* %in) #0 { entry: %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in store <2 x float> %tmp0, <2 x float> addrspace(1)* %out @@ -38,7 +38,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in store <3 x float> %tmp0, <3 x float> addrspace(3)* %out @@ -52,7 +52,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in store <4 x float> %tmp0, <4 x float> addrspace(3)* %out @@ -71,7 +71,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in store <8 x float> %tmp0, <8 x float> addrspace(3)* %out @@ -100,7 +100,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 { entry: %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in store <16 x float> %tmp0, <16 x float> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll index 27d39b7e9d7..9ad6c087bf2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-f64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-f64.ll @@ -9,7 +9,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 { %ld = load double, double addrspace(3)* %in store double %ld, double addrspace(3)* %out ret void @@ -22,7 +22,7 @@ define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 { entry: %ld = load <2 x double>, <2 x double> addrspace(3)* %in store <2 x double> %ld, <2 x double> addrspace(3)* %out @@ -39,7 +39,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 { entry: %ld = load <3 x double>, <3 x double> addrspace(3)* %in store <3 x double> %ld, <3 x double> addrspace(3)* %out @@ -59,7 +59,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 
x double> addrspace(3)* %in) #0 { entry: %ld = load <4 x double>, <4 x double> addrspace(3)* %in store <4 x double> %ld, <4 x double> addrspace(3)* %out @@ -88,7 +88,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 { entry: %ld = load <8 x double>, <8 x double> addrspace(3)* %in store <8 x double> %ld, <8 x double> addrspace(3)* %out @@ -144,7 +144,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 { entry: %ld = load <16 x double>, <16 x double> addrspace(3)* %in store <16 x double> %ld, <16 x double> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll index 2eed9917b5e..e8f134b1fb2 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i1.ll @@ -10,56 +10,56 @@ ; EG: LDS_UBYTE_READ_RET ; EG: AND_INT ; EG: LDS_BYTE_WRITE -define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %load = load i1, i1 addrspace(3)* %in store i1 %load, i1 addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v2i1: -define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in store <2 x i1> %load, <2 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v3i1: -define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in store <3 x i1> %load, <3 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v4i1: -define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in store <4 x i1> %load, <4 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v8i1: -define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in store <8 x i1> %load, <8 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v16i1: -define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in store <16 x i1> %load, <16 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v32i1: -define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = 
load <32 x i1>, <32 x i1> addrspace(3)* %in store <32 x i1> %load, <32 x i1> addrspace(3)* %out ret void } ; FUNC-LABEL: {{^}}local_load_v64i1: -define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in store <64 x i1> %load, <64 x i1> addrspace(3)* %out ret void @@ -68,7 +68,7 @@ define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace( ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32: ; GCN: ds_read_u8 ; GCN: ds_write_b32 -define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = zext i1 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -82,7 +82,7 @@ define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = sext i1 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -90,7 +90,7 @@ define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* % } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32: -define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -98,7 +98,7 @@ define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32: -define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -106,7 +106,7 @@ define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32: -define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -114,7 +114,7 @@ define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32: -define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -122,7 +122,7 @@ define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> } ; 
FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32: -define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(3)* %out @@ -130,7 +130,7 @@ define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32: -define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i32> store <3 x i32> %ext, <3 x i32> addrspace(3)* %out @@ -138,7 +138,7 @@ define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32: -define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -146,7 +146,7 @@ define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32: -define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -154,7 +154,7 @@ define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32: -define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -162,7 +162,7 @@ define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32: -define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -170,7 +170,7 @@ define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32: -define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -178,7 
+178,7 @@ define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32: -define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -186,7 +186,7 @@ define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32: -define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -194,7 +194,7 @@ define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32: -define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -202,7 +202,7 @@ define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32: -define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -210,7 +210,7 @@ define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32: -define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -221,7 +221,7 @@ define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}} ; GCN: ds_write_b64 -define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = zext i1 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -233,7 +233,7 @@ define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* % ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]] ; GCN: ds_write_b64 -define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* 
%out, i1 addrspace(3)* %in) #0 { %a = load i1, i1 addrspace(3)* %in %ext = sext i1 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -241,7 +241,7 @@ define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* % } ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64: -define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = zext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -249,7 +249,7 @@ define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64: -define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 { %load = load <1 x i1>, <1 x i1> addrspace(3)* %in %ext = sext <1 x i1> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -257,7 +257,7 @@ define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64: -define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = zext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -265,7 +265,7 @@ define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64: -define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 { %load = load <2 x i1>, <2 x i1> addrspace(3)* %in %ext = sext <2 x i1> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -273,7 +273,7 @@ define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64: -define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = zext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(3)* %out @@ -281,7 +281,7 @@ define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64: -define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 { %load = load <3 x i1>, <3 x i1> addrspace(3)* %in %ext = sext <3 x i1> %load to <3 x i64> store <3 x i64> %ext, <3 x i64> addrspace(3)* %out @@ -289,7 +289,7 @@ define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64: -define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = zext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -297,7 +297,7 @@ define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64: -define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 { %load = load <4 x i1>, <4 x i1> addrspace(3)* %in %ext = sext <4 x i1> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -305,7 +305,7 @@ define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64: -define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = zext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -313,7 +313,7 @@ define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64: -define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 { %load = load <8 x i1>, <8 x i1> addrspace(3)* %in %ext = sext <8 x i1> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -321,7 +321,7 @@ define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> } ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64: -define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = zext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -329,7 +329,7 @@ define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64: -define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 { %load = load <16 x i1>, <16 x i1> addrspace(3)* %in %ext = sext <16 x i1> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -337,7 +337,7 @@ define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x } ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64: -define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = zext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -345,7 +345,7 @@ define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64: 
-define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 { %load = load <32 x i1>, <32 x i1> addrspace(3)* %in %ext = sext <32 x i1> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -353,7 +353,7 @@ define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x } ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64: -define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = zext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -361,7 +361,7 @@ define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x } ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64: -define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 { %load = load <64 x i1>, <64 x i1> addrspace(3)* %in %ext = sext <64 x i1> %load to <64 x i64> store <64 x i64> %ext, <64 x i64> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll index d4e86de66af..bbbb34e8d33 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll @@ -10,7 +10,7 @@ ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) { +define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) { entry: %ld = load i16, i16 addrspace(3)* %in store i16 %ld, i16 addrspace(3)* %out @@ -25,7 +25,7 @@ entry: ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) { entry: %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in store <2 x i16> %ld, <2 x i16> addrspace(3)* %out @@ -39,7 +39,7 @@ entry: ; EG-DAG: LDS_USHORT_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in store <3 x i16> %ld, <3 x i16> addrspace(3)* %out @@ -51,7 +51,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) { entry: %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in store <4 x i16> %ld, <4 x i16> addrspace(3)* %out @@ -65,7 +65,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) { +define amdgpu_kernel void 
@local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) { entry: %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in store <8 x i16> %ld, <8 x i16> addrspace(3)* %out @@ -86,7 +86,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) { entry: %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in store <16 x i16> %ld, <16 x i16> addrspace(3)* %out @@ -102,7 +102,7 @@ entry: ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = zext i16 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -121,7 +121,7 @@ define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal ; EG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = sext i16 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -136,7 +136,7 @@ define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = zext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -153,7 +153,7 @@ define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal ; EG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = sext <1 x i16> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -166,7 +166,7 @@ define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1 ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = zext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -181,7 +181,7 @@ define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1 ; EG: LDS_READ_RET ; EG: BFE_INT ; EG: BFE_INT -define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x 
i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -194,7 +194,7 @@ define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1 ; GCN-DAG: ds_write_b64 ; EG: LDS_READ_RET -define void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in %ext = zext <3 x i16> %ld to <3 x i32> @@ -211,7 +211,7 @@ entry: ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { +define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) { entry: %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in %ext = sext <3 x i16> %ld to <3 x i32> @@ -226,7 +226,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = zext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -244,7 +244,7 @@ define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, < ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = sext <4 x i16> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -258,7 +258,7 @@ define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -280,7 +280,7 @@ define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1 ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -304,7 +304,7 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -340,7 +340,7 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -369,7 +369,7 @@ define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -406,7 +406,7 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -471,7 +471,7 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = zext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -512,7 +512,7 @@ define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { %load = load <64 x i16>, <64 x i16> addrspace(3)* %in %ext = sext <64 x i16> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -531,7 +531,7 @@ define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG-DAG: LDS_WRITE ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = zext i16 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -558,7 +558,7 @@ define void @local_zextload_i16_to_i64(i64 
addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: LDS_WRITE ; EG-DAG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 { %a = load i16, i16 addrspace(3)* %in %ext = sext i16 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -573,7 +573,7 @@ define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y ; EG-DAG: LDS_WRITE ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -590,7 +590,7 @@ define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1 ; EG-DAG: LDS_WRITE ; EG-DAG: 16 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]] -define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 { %load = load <1 x i16>, <1 x i16> addrspace(3)* %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -600,7 +600,7 @@ define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64: ; EG: LDS_READ_RET -define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -612,7 +612,7 @@ define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1 ; EG: LDS_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: ASHR -define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 { %load = load <2 x i16>, <2 x i16> addrspace(3)* %in %ext = sext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -623,7 +623,7 @@ define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> addrspace(3)* %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -638,7 +638,7 @@ define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 { %load = load <4 x i16>, <4 x i16> 
addrspace(3)* %in %ext = sext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -651,7 +651,7 @@ define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = zext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -672,7 +672,7 @@ define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 { %load = load <8 x i16>, <8 x i16> addrspace(3)* %in %ext = sext <8 x i16> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -689,7 +689,7 @@ define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = zext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -722,7 +722,7 @@ define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 { %load = load <16 x i16>, <16 x i16> addrspace(3)* %in %ext = sext <16 x i16> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -747,7 +747,7 @@ define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = zext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -804,7 +804,7 @@ define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 ; EG-DAG: BFE_INT ; EG-DAG: ASHR ; EG-DAG: ASHR -define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 { %load = load <32 x i16>, <32 x i16> addrspace(3)* %in %ext = sext <32 x i16> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -812,7 +812,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 } ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64: -; define void @local_zextload_v64i16_to_v64i64(<64 x i64> 
addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in ; %ext = zext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -820,7 +820,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 ; } ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64: -; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 { ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in ; %ext = sext <64 x i16> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll index 280f9658ef8..86055413d2c 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { entry: %ld = load i32, i32 addrspace(3)* %in store i32 %ld, i32 addrspace(3)* %out @@ -18,7 +18,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v2i32: ; GCN: ds_read_b64 -define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { entry: %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in store <2 x i32> %ld, <2 x i32> addrspace(3)* %out @@ -28,7 +28,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v3i32: ; GCN-DAG: ds_read_b64 ; GCN-DAG: ds_read_b32 -define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 { entry: %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in store <3 x i32> %ld, <3 x i32> addrspace(3)* %out @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v4i32: ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { entry: %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in store <4 x i32> %ld, <4 x i32> addrspace(3)* %out @@ -48,7 +48,7 @@ entry: ; FUNC-LABEL: {{^}}local_load_v8i32: ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}} ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}} -define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { entry: %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in store <8 x i32> %ld, <8 x i32> addrspace(3)* %out @@ -64,7 +64,7 @@ entry: ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} 
offset1:1 -define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { entry: %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in store <16 x i32> %ld, <16 x i32> addrspace(3)* %out @@ -72,7 +72,7 @@ entry: } ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64: -define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = zext i32 %ld to i64 store i64 %ext, i64 addrspace(3)* %out @@ -80,7 +80,7 @@ define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* } ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64: -define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 { %ld = load i32, i32 addrspace(3)* %in %ext = sext i32 %ld to i64 store i64 %ext, i64 addrspace(3)* %out @@ -88,7 +88,7 @@ define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* } ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64: -define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = zext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -96,7 +96,7 @@ define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64: -define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 { %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in %ext = sext <1 x i32> %ld to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -104,7 +104,7 @@ define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3 } ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64: -define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = zext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -112,7 +112,7 @@ define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64: -define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 { %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in %ext = sext <2 x i32> %ld to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -120,7 +120,7 @@ define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3 } ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64: -define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void 
@local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = zext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -128,7 +128,7 @@ define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64: -define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 { %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in %ext = sext <4 x i32> %ld to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -136,7 +136,7 @@ define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3 } ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64: -define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = zext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -144,7 +144,7 @@ define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64: -define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 { %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in %ext = sext <8 x i32> %ld to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -152,7 +152,7 @@ define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3 } ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64: -define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = sext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -160,7 +160,7 @@ define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 } ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64 -define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 { %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in %ext = zext <16 x i32> %ld to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -168,7 +168,7 @@ define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 } ; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64: -define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = sext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -176,7 +176,7 @@ define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 } ; FUNC-LABEL: 
{{^}}local_zextload_v32i32_to_v32i64: -define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 { %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in %ext = zext <32 x i32> %ld to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll index 180807df7b9..0c719a9e0bf 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i64.ll @@ -9,7 +9,7 @@ ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { %ld = load i64, i64 addrspace(3)* %in store i64 %ld, i64 addrspace(3)* %out ret void @@ -22,7 +22,7 @@ define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 { ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 { entry: %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in store <2 x i64> %ld, <2 x i64> addrspace(3)* %out @@ -39,7 +39,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 { entry: %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in store <3 x i64> %ld, <3 x i64> addrspace(3)* %out @@ -59,7 +59,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 { entry: %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in store <4 x i64> %ld, <4 x i64> addrspace(3)* %out @@ -88,7 +88,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 { entry: %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in store <8 x i64> %ld, <8 x i64> addrspace(3)* %out @@ -144,7 +144,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 { entry: %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in store <16 x i64> %ld, <16 x i64> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll index 9ffc74213dd..731996ec6c4 100644 --- a/llvm/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/llvm/test/CodeGen/AMDGPU/load-local-i8.ll @@ -9,7 +9,7 @@ ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET -define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { entry: %ld = load i8, i8 addrspace(3)* %in store i8 %ld, i8 
addrspace(3)* %out @@ -22,7 +22,7 @@ entry: ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET -define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { entry: %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in store <2 x i8> %ld, <2 x i8> addrspace(3)* %out @@ -33,7 +33,7 @@ entry: ; GCN: ds_read_b32 ; EG: DS_READ_RET -define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in store <3 x i8> %ld, <3 x i8> addrspace(3)* %out @@ -44,7 +44,7 @@ entry: ; GCN: ds_read_b32 ; EG: LDS_READ_RET -define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { entry: %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in store <4 x i8> %ld, <4 x i8> addrspace(3)* %out @@ -56,7 +56,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { entry: %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in store <8 x i8> %ld, <8 x i8> addrspace(3)* %out @@ -71,7 +71,7 @@ entry: ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { entry: %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in store <16 x i8> %ld, <16 x i8> addrspace(3)* %out @@ -84,7 +84,7 @@ entry: ; GCN: ds_read_u8 ; EG: LDS_UBYTE_READ_RET -define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i32 store i32 %ext, i32 addrspace(3)* %out @@ -98,7 +98,7 @@ define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %ld = load i8, i8 addrspace(3)* %in %ext = sext i8 %ld to i32 store i32 %ext, i32 addrspace(3)* %out @@ -108,7 +108,7 @@ define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* % ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32: ; EG: LDS_UBYTE_READ_RET -define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -119,7 +119,7 @@ define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT -define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> 
addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i32> store <1 x i32> %ext, <1 x i32> addrspace(3)* %out @@ -130,7 +130,7 @@ define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> ; GCN: ds_read_u16 ; EG: LDS_USHORT_READ_RET -define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -156,7 +156,7 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> ; EG: LDS_USHORT_READ_RET ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i32> store <2 x i32> %ext, <2 x i32> addrspace(3)* %out @@ -172,7 +172,7 @@ define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff, ; EG: LDS_READ_RET -define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in %ext = zext <3 x i8> %ld to <3 x i32> @@ -197,7 +197,7 @@ entry: ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 { entry: %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in %ext = sext <3 x i8> %ld to <3 x i32> @@ -214,7 +214,7 @@ entry: ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -231,7 +231,7 @@ define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i32> store <4 x i32> %ext, <4 x i32> addrspace(3)* %out @@ -248,7 +248,7 @@ define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i32> store <8 x i32> 
%ext, <8 x i32> addrspace(3)* %out @@ -267,7 +267,7 @@ define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i32> store <8 x i32> %ext, <8 x i32> addrspace(3)* %out @@ -292,7 +292,7 @@ define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT ; EG-DAG: BFE_UINT -define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -321,7 +321,7 @@ define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i32> store <16 x i32> %ext, <16 x i32> addrspace(3)* %out @@ -338,7 +338,7 @@ define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -355,7 +355,7 @@ define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i32> store <32 x i32> %ext, <32 x i32> addrspace(3)* %out @@ -380,7 +380,7 @@ define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(3)* %in %ext = zext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -405,7 +405,7 @@ define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET ; EG-DAG: LDS_READ_RET -define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* 
%in) #0 { +define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { %load = load <64 x i8>, <64 x i8> addrspace(3)* %in %ext = sext <64 x i8> %load to <64 x i32> store <64 x i32> %ext, <64 x i32> addrspace(3)* %out @@ -420,7 +420,7 @@ define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x ; EG: LDS_UBYTE_READ_RET ; EG: MOV {{.*}}, literal ; EG: 0.0 -define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -437,7 +437,7 @@ define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* % ; EG: ASHR ; TODO: why not 7? ; EG: 31 -define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = sext i8 %a to i64 store i64 %ext, i64 addrspace(3)* %out @@ -450,7 +450,7 @@ define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* % ; EG: MOV {{.*}}, literal ; TODO: merge? ; EG: 0.0 -define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -463,7 +463,7 @@ define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> ; EG: ASHR ; TODO: why not 7? 
; EG: 31 -define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i64> store <1 x i64> %ext, <1 x i64> addrspace(3)* %out @@ -473,7 +473,7 @@ define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64: ; EG: LDS_USHORT_READ_RET -define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -485,7 +485,7 @@ define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> ; EG: LDS_USHORT_READ_RET ; EG: BFE_INT ; EG: BFE_INT -define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i64> store <2 x i64> %ext, <2 x i64> addrspace(3)* %out @@ -495,7 +495,7 @@ define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64: ; EG: LDS_READ_RET -define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -505,7 +505,7 @@ define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64: ; EG: LDS_READ_RET -define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i64> store <4 x i64> %ext, <4 x i64> addrspace(3)* %out @@ -516,7 +516,7 @@ define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -536,7 +536,7 @@ define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT ; EG-DAG: BFE_INT -define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i64> store <8 x i64> %ext, <8 x i64> addrspace(3)* %out @@ -549,7 
+549,7 @@ define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -562,7 +562,7 @@ define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i64> store <16 x i64> %ext, <16 x i64> addrspace(3)* %out @@ -579,7 +579,7 @@ define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -596,7 +596,7 @@ define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x ; EG: LDS_READ_RET ; EG: LDS_READ_RET ; EG: LDS_READ_RET -define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i64> store <32 x i64> %ext, <32 x i64> addrspace(3)* %out @@ -604,7 +604,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x } ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64: -; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = zext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -612,7 +612,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x ; } ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64: -; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = sext <64 x i8> %load to <64 x i64> ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out @@ -625,7 +625,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE -define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, 
i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = zext i8 %a to i16 store i16 %ext, i16 addrspace(3)* %out @@ -639,7 +639,7 @@ define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT ; EG: LDS_SHORT_WRITE -define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 { %a = load i8, i8 addrspace(3)* %in %ext = sext i8 %a to i16 store i16 %ext, i16 addrspace(3)* %out @@ -650,7 +650,7 @@ define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* % ; EG: LDS_UBYTE_READ_RET ; EG: LDS_SHORT_WRITE -define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = zext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(3)* %out @@ -662,7 +662,7 @@ define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> ; EG: LDS_UBYTE_READ_RET ; EG: BFE_INT ; EG: LDS_SHORT_WRITE -define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 { %load = load <1 x i8>, <1 x i8> addrspace(3)* %in %ext = sext <1 x i8> %load to <1 x i16> store <1 x i16> %ext, <1 x i16> addrspace(3)* %out @@ -673,7 +673,7 @@ define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> ; EG: LDS_USHORT_READ_RET ; EG: LDS_WRITE -define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = zext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(3)* %out @@ -686,7 +686,7 @@ define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> ; EG: BFE_INT ; EG: BFE_INT ; EG: LDS_WRITE -define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 { %load = load <2 x i8>, <2 x i8> addrspace(3)* %in %ext = sext <2 x i8> %load to <2 x i16> store <2 x i16> %ext, <2 x i16> addrspace(3)* %out @@ -698,7 +698,7 @@ define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> ; EG: LDS_READ_RET ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = zext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(3)* %out @@ -715,7 +715,7 @@ define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> ; EG-DAG: BFE_INT ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 { %load = 
load <4 x i8>, <4 x i8> addrspace(3)* %in %ext = sext <4 x i8> %load to <4 x i16> store <4 x i16> %ext, <4 x i16> addrspace(3)* %out @@ -730,7 +730,7 @@ define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = zext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(3)* %out @@ -754,7 +754,7 @@ define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 { %load = load <8 x i8>, <8 x i8> addrspace(3)* %in %ext = sext <8 x i8> %load to <8 x i16> store <8 x i16> %ext, <8 x i16> addrspace(3)* %out @@ -775,7 +775,7 @@ define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = zext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(3)* %out @@ -813,7 +813,7 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 { %load = load <16 x i8>, <16 x i8> addrspace(3)* %in %ext = sext <16 x i8> %load to <16 x i16> store <16 x i16> %ext, <16 x i16> addrspace(3)* %out @@ -846,7 +846,7 @@ define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = zext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(3)* %out @@ -908,7 +908,7 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x ; EG: LDS_WRITE ; EG: LDS_WRITE ; EG: LDS_WRITE -define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 { %load = load <32 x i8>, <32 x i8> addrspace(3)* %in %ext = sext <32 x i8> %load to <32 x i16> store <32 x i16> %ext, <32 x i16> addrspace(3)* %out @@ -916,7 +916,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x } ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16: -; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; 
define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = zext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out @@ -924,7 +924,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x ; } ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16: -; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { +; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 { ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in ; %ext = sext <64 x i8> %load to <64 x i16> ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll index bc5e4945fb0..d6162c388b5 100644 --- a/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/load-weird-sizes.ll @@ -8,7 +8,7 @@ ; SI: {{flat|buffer}}_load_ubyte ; SI: {{flat|buffer}}_load_ushort ; SI: {{flat|buffer}}_store_dword -define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { +define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { %1 = load i24, i24 addrspace(1)* %in %2 = zext i24 %1 to i32 store i32 %2, i32 addrspace(1)* %out @@ -21,7 +21,7 @@ define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 { ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]] ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]] -define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { +define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 { %1 = load i25, i25 addrspace(1)* %in %2 = zext i25 %1 to i32 store i32 %2, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll index a7cee43187c..bf4a93237bd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-64.ll +++ b/llvm/test/CodeGen/AMDGPU/local-64.ll @@ -5,7 +5,7 @@ ; BOTH-LABEL: {{^}}local_i32_load ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28 ; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7 %val = load i32, i32 addrspace(3)* %gep, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -15,7 +15,7 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw ; BOTH-LABEL: {{^}}local_i32_load_0_offset ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} ; BOTH: buffer_store_dword [[REG]], -define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind { %val = load i32, i32 addrspace(3)* %in, align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -25,7 +25,7 @@ define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* % ; BOTH-NOT: ADD ; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535 ; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) 
nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535 %val = load i8, i8 addrspace(3)* %gep, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 @@ -40,7 +40,7 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3) ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]] ; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]] ; BOTH: buffer_store_byte [[REG]], -define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind { %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536 %val = load i8, i8 addrspace(3)* %gep, align 4 store i8 %val, i8 addrspace(1)* %out, align 4 @@ -51,7 +51,7 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa ; BOTH-NOT: ADD ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7 %val = load i64, i64 addrspace(3)* %gep, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -61,7 +61,7 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw ; BOTH-LABEL: {{^}}local_i64_load_0_offset ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind { %val = load i64, i64 addrspace(3)* %in, align 8 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -71,7 +71,7 @@ define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* % ; BOTH-NOT: ADD ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56 ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %gep = getelementptr double, double addrspace(3)* %in, i32 7 %val = load double, double addrspace(3)* %gep, align 8 store double %val, double addrspace(1)* %out, align 8 @@ -81,7 +81,7 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) ; BOTH-LABEL: {{^}}local_f64_load_0_offset ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; BOTH: buffer_store_dwordx2 [[REG]], -define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { +define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind { %val = load double, double addrspace(3)* %in, align 8 store double %val, double addrspace(1)* %out, align 8 ret void @@ -90,7 +90,7 @@ define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace ; BOTH-LABEL: {{^}}local_i64_store: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_i64_store(i64 addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7 store i64 5678, i64 addrspace(3)* %gep, align 8 ret void @@ -99,7 +99,7 @@ define void 
@local_i64_store(i64 addrspace(3)* %out) nounwind { ; BOTH-LABEL: {{^}}local_i64_store_0_offset: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { store i64 1234, i64 addrspace(3)* %out, align 8 ret void } @@ -107,7 +107,7 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind { ; BOTH-LABEL: {{^}}local_f64_store: ; BOTH-NOT: ADD ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56 -define void @local_f64_store(double addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind { %gep = getelementptr double, double addrspace(3)* %out, i32 7 store double 16.0, double addrspace(3)* %gep, align 8 ret void @@ -115,7 +115,7 @@ define void @local_f64_store(double addrspace(3)* %out) nounwind { ; BOTH-LABEL: {{^}}local_f64_store_0_offset ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} -define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { store double 20.0, double addrspace(3)* %out, align 8 ret void } @@ -124,7 +124,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind { ; BOTH-NOT: ADD ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15 ; BOTH: s_endpgm -define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7 store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16 ret void @@ -134,7 +134,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind { ; BOTH-NOT: ADD ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm -define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16 ret void } @@ -144,7 +144,7 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind { ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29 ; BOTH: s_endpgm -define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7 store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16 ret void @@ -155,7 +155,7 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind { ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1 ; BOTH: s_endpgm -define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { +define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind { store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> 
addrspace(3)* %out, align 16 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics.ll b/llvm/test/CodeGen/AMDGPU/local-atomics.ll index 6714a28aa43..de029d964b0 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics.ll @@ -11,7 +11,7 @@ ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; EG: LDS_WRXCHG_RET * ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -37,7 +37,7 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -47,7 +47,7 @@ define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_ADD_RET * ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -59,7 +59,7 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -73,7 +73,7 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 
addrspace(3)* %ptr, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -84,7 +84,7 @@ define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -96,7 +96,7 @@ define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -109,7 +109,7 @@ define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 a ; EG: LDS_SUB_RET * ; GCN: ds_sub_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -119,7 +119,7 @@ define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_SUB_RET * ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -131,7 +131,7 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -142,7 +142,7 @@ define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst store i32 %result, i32 
addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -163,7 +163,7 @@ define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_AND_RET * ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -174,7 +174,7 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -184,7 +184,7 @@ define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %pt ; EG: LDS_OR_RET * ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -195,7 +195,7 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace( ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -205,7 +205,7 @@ define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_XOR_RET * ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -214,7 +214,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; FIXME: There is no atomic nand instr ; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst ; store i32 %result, i32 addrspace(1)* %out, align 4 ; ret void @@ -224,7 +224,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -234,7 +234,7 @@ define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_MIN_INT_RET * ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -245,7 +245,7 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -255,7 +255,7 @@ define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p ; EG: LDS_MAX_INT_RET * ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -266,7 +266,7 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -276,7 +276,7 @@ define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; EG: LDS_MIN_UINT_RET * ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i32 addrspace(3)* 
%gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -287,7 +287,7 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -297,7 +297,7 @@ define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* % ; EG: LDS_MAX_UINT_RET * ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst store i32 %result, i32 addrspace(1)* %out, align 4 @@ -310,7 +310,7 @@ define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]] ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -318,7 +318,7 @@ define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset: ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -330,7 +330,7 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]] ; GCN: ds_add_u32 [[VPTR]], [[DATA]] ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -338,7 +338,7 @@ define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset: ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -348,7 +348,7 @@ define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 
%b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -360,7 +360,7 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst ret void } @@ -369,7 +369,7 @@ define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst ret void @@ -379,7 +379,7 @@ define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind { %sub = sub i32 %a, %b %add = add i32 %sub, 4 %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add @@ -390,7 +390,7 @@ define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32: ; GCN: ds_sub_u32 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -398,7 +398,7 @@ define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset: ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -408,7 +408,7 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst ret void } @@ -417,7 +417,7 @@ define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}} ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16 ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst ret void @@ -426,7 +426,7 @@ define void @lds_atomic_sub1_noret_i32_offset(i32 
addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32: ; GCN: ds_and_b32 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -434,7 +434,7 @@ define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset: ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -443,7 +443,7 @@ define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32: ; GCN: ds_or_b32 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -451,7 +451,7 @@ define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset: ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -460,7 +460,7 @@ define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32: ; GCN: ds_xor_b32 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -468,7 +468,7 @@ define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset: ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -476,7 +476,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FIXME: There is no atomic nand instr ; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst ; ret void ; } @@ -484,7 +484,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32: ; GCN: ds_min_i32 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -492,7 +492,7 @@ define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset: ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -501,7 +501,7 @@ define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32: ; GCN: ds_max_i32 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -509,7 +509,7 @@ define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset: ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -518,7 +518,7 @@ define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32: ; GCN: ds_min_u32 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -526,7 +526,7 @@ define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset: ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst ret void @@ -535,7 +535,7 @@ define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32: ; GCN: ds_max_u32 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst ret void } @@ -543,7 +543,7 @@ define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) 
nounwind { ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset: ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind { %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll index c88917812ed..6572a7bcd4f 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomics64.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics64.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64: ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -13,7 +13,7 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset: ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -23,7 +23,7 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64: ; GCN: ds_add_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -38,7 +38,7 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -51,7 +51,7 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -60,7 +60,7 @@ define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % 
; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset: ; GCN: ds_add_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -70,7 +70,7 @@ define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64: ; GCN: ds_sub_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -79,7 +79,7 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset: ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -92,7 +92,7 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: buffer_store_dwordx2 [[RESULT]], ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -101,7 +101,7 @@ define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset: ; GCN: ds_sub_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -111,7 +111,7 @@ define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64: ; GCN: ds_and_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -120,7 +120,7 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset: ; GCN: ds_and_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* 
%out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -130,7 +130,7 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64: ; GCN: ds_or_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -139,7 +139,7 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset: ; GCN: ds_or_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -149,7 +149,7 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace( ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64: ; GCN: ds_xor_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -158,7 +158,7 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset: ; GCN: ds_xor_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -167,7 +167,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; FIXME: There is no atomic nand instr ; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void @@ -176,7 +176,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64: ; GCN: ds_min_rtn_i64 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -185,7 +185,7 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset: ; GCN: ds_min_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -195,7 +195,7 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64: ; GCN: ds_max_rtn_i64 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -204,7 +204,7 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset: ; GCN: ds_max_rtn_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -214,7 +214,7 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64: ; GCN: ds_min_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -223,7 +223,7 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset: ; GCN: ds_min_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = 
atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -233,7 +233,7 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64: ; GCN: ds_max_rtn_u64 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -242,7 +242,7 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* % ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset: ; GCN: ds_max_rtn_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst store i64 %result, i64 addrspace(1)* %out, align 8 @@ -252,7 +252,7 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64: ; GCN: ds_wrxchg_rtn_b64 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -260,7 +260,7 @@ define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset: ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -269,7 +269,7 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64: ; GCN: ds_add_u64 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -282,7 +282,7 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]] ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst ret void @@ -293,7 +293,7 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 
seq_cst ret void } @@ -301,7 +301,7 @@ define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset: ; GCN: ds_add_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst ret void @@ -310,7 +310,7 @@ define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64: ; GCN: ds_sub_u64 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -318,7 +318,7 @@ define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset: ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -329,7 +329,7 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}} ; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst ret void } @@ -337,7 +337,7 @@ define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset: ; GCN: ds_sub_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst ret void @@ -346,7 +346,7 @@ define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64: ; GCN: ds_and_b64 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -354,7 +354,7 @@ define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset: ; GCN: ds_and_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -363,7 +363,7 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64: ; GCN: ds_or_b64 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) 
nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -371,7 +371,7 @@ define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset: ; GCN: ds_or_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -380,7 +380,7 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64: ; GCN: ds_xor_b64 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -388,7 +388,7 @@ define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset: ; GCN: ds_xor_b64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -396,7 +396,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; FIXME: There is no atomic nand instr ; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this. 
-; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { +; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst ; ret void ; } @@ -404,7 +404,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64: ; GCN: ds_min_i64 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -412,7 +412,7 @@ define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset: ; GCN: ds_min_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -421,7 +421,7 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64: ; GCN: ds_max_i64 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -429,7 +429,7 @@ define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset: ; GCN: ds_max_i64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -438,7 +438,7 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64: ; GCN: ds_min_u64 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -446,7 +446,7 @@ define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset: ; GCN: ds_min_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst ret void @@ -455,7 +455,7 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64: ; GCN: ds_max_u64 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst ret void } @@ -463,7 +463,7 @@ define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind { ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset: 
; GCN: ds_max_u64 {{.*}} offset:32 ; GCN: s_endpgm -define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { +define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind { %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4 %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst ret void diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll index a57e4f59532..4ce9208eadd 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll @@ -17,7 +17,7 @@ ; GCN: s_barrier ; GCN: ds_read_b32 {{v[0-9]+}}, -define void @local_memory(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 { entry: %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i @@ -61,7 +61,7 @@ entry: ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]] ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7 -define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.amdgcn.workitem.id.x() %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.ll b/llvm/test/CodeGen/AMDGPU/local-memory.ll index 1a11332f865..6124237d763 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.ll @@ -14,7 +14,7 @@ ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4 ; R600: LDS_READ_RET -define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 { entry: %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1 %tmp1 = load i32, i32 addrspace(3)* %tmp0 @@ -30,7 +30,7 @@ entry: ; R600: LDS_READ_RET ; GCN-DAG: ds_read_b32 ; GCN-DAG: ds_read2_b32 -define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { +define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 { %scalar = load i32, i32 addrspace(3)* %in %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)* %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll index 9841b8882b3..c8f4e4c986a 100644 --- a/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll +++ b/llvm/test/CodeGen/AMDGPU/local-memory.r600.ll @@ -15,7 +15,7 @@ ; EG-NEXT: ALU clause ; EG: LDS_READ_RET -define void @local_memory(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 { entry: %y.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i @@ -57,7 +57,7 @@ entry: ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]] ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]] -define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 { entry: %x.i = call i32 @llvm.r600.read.tidig.x() #1 %arrayidx = getelementptr inbounds [4 x 
i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i diff --git a/llvm/test/CodeGen/AMDGPU/loop-address.ll b/llvm/test/CodeGen/AMDGPU/loop-address.ll index f60d574497d..e25d4f4b4f5 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-address.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-address.ll @@ -5,7 +5,7 @@ ;CHECK: LOOP_BREAK @10 ;CHECK: POP @10 -define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { +define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 { entry: %cmp5 = icmp sgt i32 %iterations, 0 br i1 %cmp5, label %for.body, label %for.end diff --git a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll index 5fd9806813c..23ddd6488af 100644 --- a/llvm/test/CodeGen/AMDGPU/loop-idiom.ll +++ b/llvm/test/CodeGen/AMDGPU/loop-idiom.ll @@ -9,7 +9,7 @@ ; FUNC: @no_memcpy ; R600-NOT: {{^}}llvm.memcpy ; SI-NOT: {{^}}llvm.memcpy -define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { +define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) { entry: %dest = alloca i8, i32 32 br label %for.body @@ -33,7 +33,7 @@ for.end: ; R600-NOT: {{^}}memset_pattern16: ; SI-NOT: {{^}}llvm.memset ; SI-NOT: {{^}}memset_pattern16: -define void @no_memset(i32 %size) { +define amdgpu_kernel void @no_memset(i32 %size) { entry: %dest = alloca i8, i32 32 br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll b/llvm/test/CodeGen/AMDGPU/loop_break.ll index 97212f5dc12..492472155ee 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -43,7 +43,7 @@ ; GCN: ; BB#4: ; %bb9 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]] ; GCN-NEXT: s_endpgm -define void @break_loop(i32 %arg) #0 { +define amdgpu_kernel void @break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -87,7 +87,7 @@ bb9: ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @undef_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -140,7 +140,7 @@ bb9: ; preds = %Flow ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @constexpr_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -190,7 +190,7 @@ bb9: ; preds = %Flow ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @true_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -240,7 +240,7 @@ bb9: ; preds = %Flow ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi) ; OPT-NEXT: store volatile i32 7 ; OPT-NEXT: ret void -define void @false_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg @@ -295,7 +295,7 @@ bb9: ; preds = %Flow ; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %1) ; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef ; OPT-NEXT: ret void -define void @invert_true_phi_cond_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 
%arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg diff --git a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll index d5cd10e8518..74564f387ed 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll @@ -9,7 +9,7 @@ declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) ; Test the upper bound for sizes to leave ; OPT-LABEL: @max_size_small_static_memcpy_caller0( ; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) -define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) ret void } @@ -21,14 +21,14 @@ define void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addr ; OPT-NEXT: load i8 ; OPT: getelementptr ; OPT-NEXT: store i8 -define void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false) ret void } ; OPT-LABEL: @max_size_small_static_memmove_caller0( ; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) -define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false) ret void } @@ -39,14 +39,14 @@ define void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 add ; OPT-NEXT: load i8 ; OPT: getelementptr ; OPT-NEXT: store i8 -define void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { +define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 { call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false) ret void } ; OPT-LABEL: @max_size_small_static_memset_caller0( ; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false) -define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { +define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false) ret void } @@ -55,7 +55,7 @@ define void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val ; OPT-NOT: call ; OPT: getelementptr ; OPT: store i8 -define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { +define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 { call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false) ret void } @@ -63,7 +63,7 @@ define void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val ; OPT-LABEL: @variable_memcpy_caller0( ; OPT-NOT: call ; 
OPT: phi -define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { +define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) ret void } @@ -71,7 +71,7 @@ define void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %sr ; OPT-LABEL: @variable_memcpy_caller1( ; OPT-NOT: call ; OPT: phi -define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { +define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) ret void } @@ -82,7 +82,7 @@ define void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %sr ; OPT-NOT: call ; OPT: phi ; OPT-NOT: call -define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 { +define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false) ret void @@ -94,7 +94,7 @@ define void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace( ; OPT: load i8, i8 addrspace(3)* ; OPT: getelementptr inbounds i8, i8 addrspace(1)* ; OPT: store i8 -define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 { +define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 { call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false) ret void } @@ -107,7 +107,7 @@ define void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 % ; OPT: store i8 ; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false) -define void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 { +define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 { call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false) call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false) ret void diff --git a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll index e1fad13e0b5..4e0ecc0565e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range: ; CHECK-NOT: v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0 -define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %and = and i32 %id, 1023 @@ -16,7 +16,7 @@ entry: ; CHECK-LABEL: 
{{^}}test_workitem_id_x_known_trunc_1_bit_range: ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] -define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0 %and = and i32 %id, 511 @@ -28,7 +28,7 @@ entry: ; CHECK-NOT: v0 ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]] -define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 { entry: %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1 %and = and i32 %id, 255 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll index ee07b0e9766..7a05d6501a5 100644 --- a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -12,7 +12,7 @@ ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = lshr <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16 ; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -55,7 +55,7 @@ define void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -70,7 +70,7 @@ define void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1) ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -84,7 +84,7 @@ define void 
@lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1) ; GCN-LABEL: {{^}}lshr_imm_v_v2i16: ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 -define void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -98,7 +98,7 @@ define void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; GCN-LABEL: {{^}}lshr_v_imm_v2i16: ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] -define void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -115,7 +115,7 @@ define void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -133,7 +133,7 @@ define void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll index 9caba32cbac..b855fc500c6 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -31,7 +31,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0 ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -70,7 +70,7 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void 
@combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -107,7 +107,7 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a ; SI-DENORM: buffer_store_dword [[RESULT]] ; SI-STD: buffer_store_dword [[C]] -define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -137,7 +137,7 @@ define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrsp ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -174,7 +174,7 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -209,7 +209,7 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -245,7 +245,7 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias 
%out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -281,7 +281,7 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]] ; SI: buffer_store_dword [[RESULT]] -define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -319,7 +319,7 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -362,7 +362,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; SI: s_endpgm -define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -404,7 +404,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou ; SI-DENORM: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]] ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} -define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -447,7 +447,7 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float 
addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -497,7 +497,7 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -548,7 +548,7 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; SI: s_endpgm -define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { +define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll index 9183ae0972d..1e78c4ebcc9 100644 --- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll +++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll @@ -11,7 +11,7 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]] ; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0 -define void @get_global_id_0(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 { %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)* %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/mad_int24.ll b/llvm/test/CodeGen/AMDGPU/mad_int24.ll index f149ea0a6a0..af0159aa9b1 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_int24.ll @@ -11,7 +11,7 @@ ; CM: MULADD_INT24 ; SI-NOT: and ; SI: v_mad_i32_i24 -define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = ashr i32 %0, 8 diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll index 9fde950f822..2c4f7d324a9 100644 --- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll @@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; SI: v_mad_u32_u24 ; VI: v_mad_u32_u24 -define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -32,7 +32,7 @@ entry: ; FIXME: Should be using scalar instructions here. 
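The FIXME above refers to the fact that the i16 %a, %b and %c kernel arguments of the following test are uniform values held in SGPRs, so the 24-bit multiply-add and the sign-extension could in principle stay on the scalar ALU instead of going through v_mad_u32_u24 and v_bfe_i32. A rough sketch of what such a lowering might look like is given below; the register numbers are invented and this is not compiler output:

;   s_mul_i32       s4, s2, s3     ; %a * %b on the scalar unit
;   s_add_i32       s4, s4, s5     ; ... + %c
;   s_sext_i32_i16  s4, s4         ; sign-extend the i16 result to i32
;   v_mov_b32_e32   v0, s4         ; copy to a VGPR for the store

followed by the buffer/flat store the test already checks for.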
; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16 -define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { entry: %0 = mul i16 %a, %b %1 = add i16 %0, %c @@ -49,7 +49,7 @@ entry: ; EG: 8 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 -define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { entry: %0 = mul i8 %a, %b %1 = add i8 %0, %c @@ -68,7 +68,7 @@ entry: ; FUNC-LABEL: {{^}}i24_i32_i32_mad: ; EG: CNDE_INT ; SI: v_cndmask -define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: %0 = ashr i32 %a, 8 %1 = icmp ne i32 %c, 0 diff --git a/llvm/test/CodeGen/AMDGPU/madak.ll b/llvm/test/CodeGen/AMDGPU/madak.ll index 6722aa79dd5..eb4066a2a0a 100644 --- a/llvm/test/CodeGen/AMDGPU/madak.ll +++ b/llvm/test/CodeGen/AMDGPU/madak.ll @@ -10,7 +10,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -37,7 +37,7 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]] ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]] ; GCN: s_endpgm -define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1 ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { +define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -84,7 +84,7 @@ define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addr ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { 
+define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -106,7 +106,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { +define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -125,7 +125,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]] ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] -define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -141,7 +141,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add ; GCN-LABEL: {{^}}s_s_madak_f32: ; GCN-NOT: v_madak_f32 ; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} -define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 store float %madak, float addrspace(1)* %out, align 4 @@ -153,7 +153,7 @@ define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwin ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}} ; GCN: s_endpgm -define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -175,7 +175,7 @@ define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float ; GCN: buffer_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}} ; GCN: s_endpgm -define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 
@llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -201,7 +201,7 @@ define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float ; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] ; GCN: buffer_store_dword [[MUL]] -define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { +define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 br i1 %tmp, label %bb3, label %bb4 diff --git a/llvm/test/CodeGen/AMDGPU/madmk.ll b/llvm/test/CodeGen/AMDGPU/madmk.ll index 27fbf58d26c..6e70e95383c 100644 --- a/llvm/test/CodeGen/AMDGPU/madmk.ll +++ b/llvm/test/CodeGen/AMDGPU/madmk.ll @@ -12,7 +12,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] -define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -35,7 +35,7 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]] ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]] ; GCN: s_endpgm -define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] -define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -83,7 +83,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -97,7 +97,7 @@ define void 
@s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) ; GCN-NOT: v_madmk_f32 ; GCN: v_mad_f32 ; GCN: s_endpgm -define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { +define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -113,7 +113,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { +define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -130,7 +130,7 @@ define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float add ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]] -define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -151,7 +151,7 @@ define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}| -define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -172,7 +172,7 @@ define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float ; GCN: buffer_load_dword [[A:v[0-9]+]] ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0 -define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -189,7 +189,7 @@ define void @madmk_add_inline_imm_f32(float 
addrspace(1)* noalias %out, float ad ; SI: s_xor_b64 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}} ; SI: s_or_b64 -define void @kill_madmk_verifier_error() nounwind { +define amdgpu_kernel void @kill_madmk_verifier_error() nounwind { bb: br label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index d442c8754b7..8fcf85b826f 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -4,7 +4,7 @@ ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_i16: ; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid @@ -45,7 +45,7 @@ define void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrs ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid @@ -67,7 +67,7 @@ define void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrs ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid @@ -83,7 +83,7 @@ define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrs ; FIXME: Need to handle non-uniform case for function below (load without gep). 
; GCN-LABEL: {{^}}v_test_imax_sgt_i16: ; VIPLUS: v_max_i16_e32 -define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -99,7 +99,7 @@ define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_umax_uge_i16: ; VIPLUS: v_max_u16_e32 -define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -115,7 +115,7 @@ define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_umax_ugt_i16: ; VIPLUS: v_max_u16_e32 -define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid @@ -133,7 +133,7 @@ define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; VI: v_max_u16_e32 ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll index 5fa307be0fd..ffcdac03bc7 100644 --- a/llvm/test/CodeGen/AMDGPU/max.ll +++ b/llvm/test/CodeGen/AMDGPU/max.ll @@ -6,7 +6,7 @@ ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp sge i32 %a, %b @@ -26,7 +26,7 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x 
i32> addrspace(1)* %bptr) nounwind { %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4 %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4 %cmp = icmp sge <4 x i32> %a, %b @@ -39,7 +39,7 @@ define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrs ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sge i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -50,7 +50,7 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cmp = icmp sge i32 %a, 9 %val = select i1 %cmp, i32 %a, i32 9 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -63,7 +63,7 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { %a = load i8, i8 addrspace(1)* %aptr, align 1 %b = load i8, i8 addrspace(1)* %bptr, align 1 %cmp = icmp sge i8 %a, %b @@ -76,7 +76,7 @@ define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cmp = icmp sgt i32 %a, 9 %val = select i1 %cmp, i32 %a, i32 9 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -89,7 +89,7 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} ; EG: MAX_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { %cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9> %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9> store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 @@ -100,7 +100,7 @@ define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; SI: v_max_i32_e32 ; EG: MAX_INT -define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp sgt i32 %a, %b @@ -113,7 +113,7 @@ define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp sgt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -124,7 
+124,7 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp uge i32 %a, %b @@ -137,7 +137,7 @@ define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI: s_max_u32 ; EG: MAX_UINT -define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp uge i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -155,7 +155,7 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; EG: MAX_UINT ; EG: MAX_UINT ; EG-NOT: MAX_UINT -define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { +define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind { %cmp = icmp uge <3 x i32> %a, %b %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4 @@ -168,7 +168,7 @@ define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, < ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { %a = load i8, i8 addrspace(1)* %aptr, align 1 %b = load i8, i8 addrspace(1)* %bptr, align 1 %cmp = icmp uge i8 %a, %b @@ -181,7 +181,7 @@ define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i ; SI: v_max_u32_e32 ; EG: MAX_UINT -define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ugt i32 %a, %b @@ -194,7 +194,7 @@ define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; SI: s_max_u32 ; EG: MAX_UINT -define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp ugt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -207,7 +207,7 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}} ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}} -define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { +define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind { %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23> %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23> store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4 @@ -223,7 +223,7 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % 
; SI: buffer_store_dword [[VMAX]] ; EG: MAX_UINT -define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ugt i32 %a.ext, %b.ext @@ -243,7 +243,7 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1 ; SI: buffer_store_dword [[VMAX]] ; EG: MAX_INT -define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp sgt i32 %a.ext, %b.ext @@ -262,7 +262,7 @@ define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 ; SI: s_max_i32 ; EG: MAX_INT -define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { %cmp = icmp sge i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -275,7 +275,7 @@ define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin ; EG: MAX_UINT ; EG: MAX_UINT -define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp ugt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -287,7 +287,7 @@ define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG: MAX_UINT ; EG: MAX_UINT -define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp uge i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -299,7 +299,7 @@ define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MAX_UINT ; EG-DAG: MAX_INT -define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp sgt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -311,7 +311,7 @@ define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MAX_UINT ; EG-DAG: MAX_INT -define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %tmp = icmp sge i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/max3.ll b/llvm/test/CodeGen/AMDGPU/max3.ll index a12dba2eb6e..4bb4fd46bec 100644 --- a/llvm/test/CodeGen/AMDGPU/max3.ll +++ b/llvm/test/CodeGen/AMDGPU/max3.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: @v_test_imax3_sgt_i32 ; SI: v_max3_i32 -define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 
addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt ; FUNC-LABEL: @v_test_umax3_ugt_i32 ; SI: v_max3_u32 -define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll index 97512670f59..1cbd0c32751 100644 --- a/llvm/test/CodeGen/AMDGPU/mem-builtins.ll +++ b/llvm/test/CodeGen/AMDGPU/mem-builtins.ll @@ -9,7 +9,7 @@ declare i32 @strcmp(i8* nocapture, i8* nocapture) #1 ; ERROR: error: <unknown>:0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp -define void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 { +define amdgpu_kernel void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 { entry: %cmp = tail call i32 @memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i64 2) store volatile i32 %cmp, i32 addrspace(1)* undef @@ -17,35 +17,35 @@ entry: } ; ERROR: error: <unknown>:0:0: in function test_memchr void (i8 addrspace(1)*, i32, i64): unsupported call to function memchr -define void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 { +define amdgpu_kernel void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 { %res = call i8 addrspace(1)* @memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) store volatile i8 addrspace(1)* %res, i8 addrspace(1)* addrspace(1)* undef ret void } ; ERROR: error: <unknown>:0:0: in function test_strcpy void (i8*, i8*): unsupported call to function strcpy -define void @test_strcpy(i8* %dst, i8* %src) #0 { +define amdgpu_kernel void @test_strcpy(i8* %dst, i8* %src) #0 { %res = call i8* @strcpy(i8* %dst, i8* %src) store volatile i8* %res, i8* addrspace(1)* undef ret void } ; ERROR: error: <unknown>:0:0: in function test_strcmp void (i8*, i8*): unsupported call to function strcmp -define void @test_strcmp(i8* %src0, i8* %src1) #0 { +define amdgpu_kernel void @test_strcmp(i8* %src0, i8* %src1) #0 { %res = call i32 @strcmp(i8* %src0, i8* %src1) store volatile i32 %res, i32 addrspace(1)* undef ret void } ; ERROR: error: <unknown>:0:0: in function test_strlen void (i8*): unsupported call to function strlen -define void @test_strlen(i8* %src) #0 { +define amdgpu_kernel void @test_strlen(i8* %src) #0 { %res = call i32 @strlen(i8* %src) store volatile i32 %res, i32 addrspace(1)* undef ret void } ; ERROR: error: <unknown>:0:0: in function test_strnlen void (i8*, i32): unsupported call to function strnlen -define void @test_strnlen(i8* %src, i32 %size) #0 { +define amdgpu_kernel void @test_strnlen(i8* %src, i32 %size) #0 { %res = call i32 @strnlen(i8* %src, i32 %size) store volatile i32 %res, i32 addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/merge-stores.ll 
b/llvm/test/CodeGen/AMDGPU/merge-stores.ll index fd2d8ee1189..dfd5b97fcc8 100644 --- a/llvm/test/CodeGen/AMDGPU/merge-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/merge-stores.ll @@ -13,7 +13,7 @@ ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -25,7 +25,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -35,7 +35,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16: ; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -45,7 +45,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16: ; GCN: buffer_store_dword v -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 0, i16 addrspace(1)* %out.gep.1 @@ -57,7 +57,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { ; GCN: buffer_store_short ; GCN: buffer_store_short ; GCN: s_endpgm -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -69,7 +69,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -79,7 +79,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32: ; GCN: buffer_store_dwordx2 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* store float 1.0, float addrspace(1)* %out.gep.1.bc @@ -91,7 +91,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { ; SI-DAG: 
v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* store i32 123, i32 addrspace(1)* %out.gep.1.bc @@ -105,7 +105,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}} ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -119,7 +119,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -134,7 +134,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) ; First store is out of order. 
; GCN-LABEL: {{^}}merge_global_store_4_constants_f32: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -149,7 +149,7 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32: ; GCN-AA: buffer_store_dwordx4 v ; GCN: s_endpgm -define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -169,7 +169,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o ; SI-DAG: buffer_store_dword ; SI-NOT: buffer_store_dword ; GCN: s_endpgm -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -181,7 +181,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64: ; GCN: buffer_store_dwordx4 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 store i64 123, i64 addrspace(1)* %out.gep.1 @@ -192,7 +192,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 @@ -207,7 +207,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx2 [[LOAD]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -222,7 +222,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base: ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 -define void 
@merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 @@ -241,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace( ; GCN: buffer_load_dword v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -256,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -283,7 +283,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; SI-DAG: buffer_store_dword v ; SI-DAG: buffer_store_dwordx2 v ; GCN: s_endpgm -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -302,7 +302,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -325,7 +325,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base: ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28 -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 %in.gep.1 = getelementptr i32, i32 
addrspace(1)* %in, i32 12 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 @@ -351,7 +351,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace( ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: s_barrier ; GCN: buffer_store_dwordx4 [[LOAD]] -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -388,7 +388,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* % ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v ; GCN: buffer_store_dword v -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -416,7 +416,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; GCN: buffer_load_dword [[LOAD:v[0-9]+]] ; GCN: buffer_store_dword [[LOAD]] ; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -446,7 +446,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: s_endpgm -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -470,7 +470,7 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]] ; GCN: buffer_store_dwordx4 [[LOAD]] ; GCN: s_endpgm -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -492,7 +492,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: s_endpgm -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 store i8 
123, i8 addrspace(3)* %out.gep.1 @@ -504,7 +504,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}} -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1 @@ -522,7 +522,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1 ; GCN: s_endpgm -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 @@ -540,7 +540,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}} ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}} ; GCN: buffer_store_dword v[[HI]] -define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 12, i32 addrspace(1)* %idx1, align 4 @@ -556,7 +556,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32: ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx2 -define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { store i32 13, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 15, i32 addrspace(1)* %idx1, align 4 @@ -575,7 +575,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx2 ; GCN: buffer_store_dword v -define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -596,7 +596,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { ; GCN: buffer_store_dwordx4 ; GCN: buffer_store_dwordx4 ; GCN: s_endpgm -define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -630,7 +630,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; GCN: ScratchSize: 0{{$}} -define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias 
%out, <3 x i32> addrspace(1)* noalias %in) #0 {
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
ret void
@@ -646,7 +646,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
ret void
@@ -662,7 +662,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
%fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -679,7 +679,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
%fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
store <3 x double> %fadd, <3 x double> addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index c7a89e09588..1aaa827ca94 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -7,7 +7,7 @@
; GCN: v_min_i32_e32
; EG: MIN_INT
-define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
%b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.pt
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%cmp = icmp sle i32 %a, %b
%val = select i1 %cmp, i32 %a, i32 %b
store i32 %val, i32 addrspace(1)* %out, align 4
@@ -35,7 +35,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; GCN: s_min_i32
; EG: MIN_INT
-define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
%cmp = icmp sle <1 x i32> %a, %b
%val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
%cmp = icmp sle <4 x i32> %a, %b
%val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
store <4 x i32> %val, <4 x i32> addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
; GCN: s_sext_i32_i8
; GCN: s_sext_i32_i8
; GCN: s_min_i32
-define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
%cmp = icmp sle i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
store i8 %val, i8 addrspace(1)* %out
@@ -106,7 +106,7 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %val, <4 x i8> addrspace(1)* %out
@@ -124,7 +124,7 @@ define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
%cmp = icmp sle <2 x i16> %a, %b
%val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
store <2 x i16> %val, <2 x i16> addrspace(1)* %out
@@ -150,7 +150,7 @@ define void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
-define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
+define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
%cmp = icmp sle <4 x i16> %a, %b
%val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
store <4 x i16> %val, <4 x i16> addrspace(1)* %out
@@ -161,7 +161,7 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
; GCN: v_min_i32_e32
; EG: MIN_INT
-define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
%tid = call i32 @llvm.r600.read.tidig.x()
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
%b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -180,7 +180,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; GFX89: v_min_i16_e32
; EG: MIN_INT
-define void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+define
amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid @@ -198,7 +198,7 @@ define void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp slt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -211,7 +211,7 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %val, <2 x i32> addrspace(1)* %out @@ -222,7 +222,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, < ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp slt i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -233,7 +233,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { +define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp sle i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -244,7 +244,7 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid @@ -267,7 +267,7 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.pt ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid @@ -301,7 +301,7 @@ define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrs ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, 
<3 x i16> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid @@ -319,7 +319,7 @@ define void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrs ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ule i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -330,7 +330,7 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid @@ -353,7 +353,7 @@ define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.pt ; GFX89: v_min_u16_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid @@ -371,7 +371,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ult i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -386,7 +386,7 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ult i32 %a, %b @@ -404,7 +404,7 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { %a = load i16, i16 addrspace(1)* %aptr, align 2 %b = load i16, i16 addrspace(1)* %bptr, align 2 %cmp = icmp ult i16 %a, %b @@ -419,7 +419,7 @@ define void @v_test_umin_ult_i16_multi_use(i16 
addrspace(1)* %out0, i1 addrspace ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp ult <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -444,7 +444,7 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 { %cmp = icmp ult <8 x i32> %a, %b %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %val, <8 x i32> addrspace(1)* %out @@ -478,7 +478,7 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 { +define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 { %cmp = icmp ult <8 x i16> %a, %b %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b store <8 x i16> %val, <8 x i16> addrspace(1)* %out @@ -494,7 +494,7 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, < ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ult i32 %a.ext, %b.ext @@ -514,7 +514,7 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1 ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { +define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp slt i32 %a.ext, %b.ext @@ -529,7 +529,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { +define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { %cmp = icmp sle i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -542,7 +542,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ult i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -554,7 +554,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ule i64 %a, %b %val = select i1 %tmp, i64 
%a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -566,7 +566,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp slt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -578,7 +578,7 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp sle i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -596,7 +596,7 @@ define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; EG: MIN_INT ; EG: MIN_INT -define void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid @@ -621,7 +621,7 @@ define void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrs ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/min3.ll b/llvm/test/CodeGen/AMDGPU/min3.ll index 728479ad9f6..59d5d2cdb1a 100644 --- a/llvm/test/CodeGen/AMDGPU/min3.ll +++ b/llvm/test/CodeGen/AMDGPU/min3.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: @v_test_imin3_slt_i32 ; SI: v_min3_i32 -define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid @@ -23,7 +23,7 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt ; FUNC-LABEL: @v_test_umin3_ult_i32 ; SI: v_min3_u32 -define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %gep1 = getelementptr i32, i32 
addrspace(1)* %bptr, i32 %tid @@ -43,7 +43,7 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt ; FUNC-LABEL: @v_test_umin_umin_umin ; SI: v_min_i32 ; SI: v_min3_i32 -define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tid2 = mul i32 %tid, 2 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid @@ -77,7 +77,7 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap ; FUNC-LABEL: @v_test_umin3_2_uses ; SI-NOT: v_min3 -define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { +define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tid2 = mul i32 %tid, 2 %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll index 8e1b0036a1a..83c2a911a5c 100644 --- a/llvm/test/CodeGen/AMDGPU/missing-store.ll +++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll @@ -15,7 +15,7 @@ ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}} ; SI: buffer_store_dword ; SI: s_endpgm -define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8 %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll index 85dfbe6b8a3..e1fb00a1de3 100644 --- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll +++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll @@ -19,7 +19,7 @@ ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]] ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, -define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { +define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 { bb: %tmp = icmp sgt i32 %arg3, 0 br i1 %tmp, label %bb4, label %bb17 diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll index 1a0a3902785..417b4ba802e 100644 --- a/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll +++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll @@ -11,7 +11,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-LABEL: {{^}}atomic_max_i32: ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}} -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.gep = 
getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep @@ -31,7 +31,7 @@ exit: ; GCN-LABEL: {{^}}atomic_max_i32_noret: ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}} -define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { +define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep diff --git a/llvm/test/CodeGen/AMDGPU/mubuf.ll b/llvm/test/CodeGen/AMDGPU/mubuf.ll index a574365da98..9e1d2e0490c 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; MUBUF load with an immediate byte offset that fits into 12-bits ; CHECK-LABEL: {{^}}mubuf_load0: ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0 -define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1 %1 = load i32, i32 addrspace(1)* %0 @@ -20,7 +20,7 @@ entry: ; MUBUF load with the largest possible immediate offset ; CHECK-LABEL: {{^}}mubuf_load1: ; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0 -define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095 %1 = load i8, i8 addrspace(1)* %0 @@ -32,7 +32,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_load2: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0 -define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024 %1 = load i32, i32 addrspace(1)* %0 @@ -44,7 +44,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_load3: ; CHECK-NOT: ADD ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0 -define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { +define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) { entry: %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 @@ -91,7 +91,7 @@ main_body: ; MUBUF store with an immediate byte offset that fits into 12-bits ; CHECK-LABEL: {{^}}mubuf_store0: ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0 -define void @mubuf_store0(i32 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1 store i32 0, i32 addrspace(1)* %0 @@ -102,7 +102,7 @@ entry: ; CHECK-LABEL: 
{{^}}mubuf_store1: ; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0 -define void @mubuf_store1(i8 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) { entry: %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095 store i8 0, i8 addrspace(1)* %0 @@ -113,7 +113,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_store2: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0 -define void @mubuf_store2(i32 addrspace(1)* %out) { +define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024 store i32 0, i32 addrspace(1)* %0 @@ -124,7 +124,7 @@ entry: ; CHECK-LABEL: {{^}}mubuf_store3: ; CHECK-NOT: ADD ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0 -define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { +define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) { entry: %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1 @@ -134,14 +134,14 @@ entry: ; CHECK-LABEL: {{^}}store_sgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 -define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 { store i32 99, i32 addrspace(1)* %out, align 4 ret void } ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset: ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40 -define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -150,7 +150,7 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 store i32 99, i32 addrspace(1)* %out.gep, align 4 ret void @@ -159,7 +159,7 @@ define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic: ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]] -define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst ret void @@ -167,7 +167,7 @@ define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}store_vgpr_ptr: ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %out.gep = getelementptr i32, i32 addrspace(1)* 
%out, i32 %tid store i32 99, i32 addrspace(1)* %out.gep, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll index 5215838ac1d..a72a6efb071 100644 --- a/llvm/test/CodeGen/AMDGPU/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/mul.ll @@ -11,7 +11,7 @@ ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -31,7 +31,7 @@ define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1) ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -45,7 +45,7 @@ define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* % ; SI: s_load_dword ; SI: s_mul_i32 ; SI: buffer_store_dword -define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %mul = mul i64 %b, %a %trunc = trunc i64 %mul to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -57,7 +57,7 @@ define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { ; SI: s_load_dword ; SI: v_mul_lo_i32 ; SI: buffer_store_dword -define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %mul = mul i64 %b, %a @@ -73,7 +73,7 @@ define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %a ; EG-DAG: MULHI_INT ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_i32 -define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) { entry: %0 = sext i32 %in to i64 %1 = mul i64 %0, 80 @@ -87,7 +87,7 @@ entry: ; SI-DAG: v_mul_lo_i32 ; SI-DAG: v_mul_hi_i32 ; SI: s_endpgm -define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 80 @@ -99,7 +99,7 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9 ; SI: s_endpgm -define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in, align 4 %ext = sext i32 %val to i64 %mul = mul i64 %ext, 9 @@ -114,7 +114,7 @@ define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, 
i32 addrspace(1)* % ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], ; SI: s_endpgm -define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %mul = mul i32 %a, %b store i32 %mul, i32 addrspace(1)* %out, align 4 ret void @@ -122,7 +122,7 @@ define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; FUNC-LABEL: {{^}}v_mul_i32: ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -139,7 +139,7 @@ define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; crash with a 'failed to select' error. ; FUNC-LABEL: {{^}}s_mul_i64: -define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %mul = mul i64 %a, %b store i64 %mul, i64 addrspace(1)* %out, align 8 ret void @@ -147,7 +147,7 @@ define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; FUNC-LABEL: {{^}}v_mul_i64: ; SI: v_mul_lo_i32 -define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { +define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 %mul = mul i64 %a, %b @@ -157,7 +157,7 @@ define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr ; FUNC-LABEL: {{^}}mul32_in_branch: ; SI: s_mul_i32 -define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -180,7 +180,7 @@ endif: ; SI-DAG: s_mul_i32 ; SI-DAG: v_mul_hi_u32 ; SI: s_endpgm -define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { +define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -224,7 +224,7 @@ endif: ; SI: s_mul_i32 ; SI: buffer_store_dwordx4 -define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { +define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { %mul = mul i128 %a, %b store i128 %mul, i128 addrspace(1)* %out ret void @@ -253,7 +253,7 @@ define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 { ; SI-DAG: v_mul_lo_i32 ; SI: {{buffer|flat}}_store_dwordx4 -define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll index 6f7dfe2e13e..3137569e9ca 100644 
--- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -13,7 +13,7 @@ ; Make sure we are not masking the inputs ; CM-NOT: AND ; CM: MUL_INT24 -define void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %a.shl = shl i32 %a, 8 %a.24 = ashr i32 %a.shl, 8 @@ -39,7 +39,7 @@ entry: ; CM: MULHI_INT24 ; CM: MULHI_INT24 ; CM: MULHI_INT24 -define void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %a.shl = shl i32 %a, 8 %a.24 = ashr i32 %a.shl, 8 @@ -70,7 +70,7 @@ entry: ; GCN-DAG: v_mul_i32_i24_e32 ; GCN: buffer_store_dwordx2 -define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 @@ -87,7 +87,7 @@ define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN: buffer_store_dwordx2 -define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %shl.i = shl i32 %a, 8 %shr.i = ashr i32 %shl.i, 8 %conv.i = sext i32 %shr.i to i64 @@ -112,7 +112,7 @@ define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}} ; GCN: buffer_store_dwordx2 -define void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { +define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 { entry: %a.shl = shl i33 %a, 9 %a.24 = ashr i33 %a.shl, 9 @@ -133,7 +133,7 @@ entry: ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]], ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; SI-NEXT: buffer_store_dword v[[HI]] -define void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = ashr i33 %tmp0, 9 @@ -151,7 +151,7 @@ entry: ; GCN: v_mul_i32_i24_e32 v[[VAL_LO:[0-9]+]] ; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], v[[VAL_LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}} -define void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { +define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) { bb: %cmp = icmp eq i32 %arg0, 0 br i1 %cmp, label %bb11, label %bb7 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll index 004d36f00e5..59fdc8be5ce 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone ; FUNC-LABEL: {{^}}test_umul24_i32: ; GCN: v_mul_u32_u24 -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -22,7 +22,7 @@ entry: ; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16 ; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}} ; VI: 
s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]] -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = sext i16 %mul to i32 @@ -34,7 +34,7 @@ entry: ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16 -define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x @@ -54,7 +54,7 @@ define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* ; VI: s_mul_i32 ; VI: s_and_b32 ; VI: v_mov_b32_e32 -define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = zext i16 %mul to i32 @@ -66,7 +66,7 @@ entry: ; SI: v_mul_u32_u24_e32 ; SI: v_and_b32_e32 ; VI: v_mul_lo_u16 -define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x @@ -83,7 +83,7 @@ define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}} ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8 -define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { +define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) { entry: %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %tid.y = call i32 @llvm.amdgcn.workitem.id.y() @@ -101,7 +101,7 @@ entry: ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 %b.24 = and i32 %b, 16777215 @@ -118,7 +118,7 @@ entry: ; GCN-NOT: and ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]], ; GCN-NEXT: buffer_store_dword [[RESULT]] -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 %b.24 = and i64 %b, 16777215 @@ -136,7 +136,7 @@ entry: ; GCN-DAG: v_mul_u32_u24_e32 ; GCN-DAG: v_mul_hi_u32_u24_e32 ; GCN: buffer_store_dwordx2 -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { entry: %tmp0 = shl i64 %a, 40 %a_24 = lshr i64 %tmp0, 40 @@ -152,7 +152,7 @@ entry: ; GCN-NOT: s_and_b32 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]] -define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void 
@test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) { entry: %tmp0 = shl i64 %a, 40 %a.24 = lshr i64 %tmp0, 40 @@ -166,7 +166,7 @@ entry: ; GCN: s_and_b32 ; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]] ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]] -define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.16 = and i32 %a, 65535 %b.16 = and i32 %b, 65535 @@ -186,7 +186,7 @@ entry: ; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}} -define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = lshr i33 %tmp0, 9 @@ -206,7 +206,7 @@ entry: ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]], ; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]] ; GCN-NEXT: buffer_store_dword v[[HI]] -define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { +define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) { entry: %tmp0 = shl i33 %a, 9 %a_24 = lshr i33 %tmp0, 9 diff --git a/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll b/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll index da1c111fa5c..0a646b7126d 100644 --- a/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_uint24-r600.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}test_umul24_i32: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %0 = shl i32 %a, 8 %a_24 = lshr i32 %0, 8 @@ -19,7 +19,7 @@ entry: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x ; EG: 16 -define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { +define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) { entry: %mul = mul i16 %a, %b %ext = sext i16 %mul to i32 @@ -31,7 +31,7 @@ entry: ; FUNC-LABEL: {{^}}test_umul24_i8: ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]] ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x -define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { +define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) { entry: %mul = mul i8 %a, %b %ext = sext i8 %mul to i32 @@ -41,7 +41,7 @@ entry: ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64: ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W -define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: %a.24 = and i32 %a, 16777215 %b.24 = and i32 %b, 16777215 @@ -56,7 +56,7 @@ entry: ; FUNC-LABEL: {{^}}test_umulhi24: ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y -define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) { entry: %a.24 = and i64 %a, 16777215 %b.24 = and i64 %b, 16777215 @@ -71,7 +71,7 @@ entry: ; FUNC-LABEL: {{^}}test_umul24_i64: ; EG; MUL_UINT24 ; EG: MULHI -define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @test_umul24_i64(i64 
addrspace(1)* %out, i64 %a, i64 %b) { entry: %tmp0 = shl i64 %a, 40 %a_24 = lshr i64 %tmp0, 40 diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll index 95c7ce86232..15de689b953 100644 --- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll +++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll @@ -64,7 +64,7 @@ ENDIF: ; preds = %LOOP br i1 %tmp51, label %LOOP, label %LOOP.outer } -; OPT-LABEL: define void @multi_if_break_loop( +; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop( ; OPT: llvm.amdgcn.break ; OPT: llvm.amdgcn.loop ; OPT: llvm.amdgcn.if.break @@ -79,7 +79,7 @@ ENDIF: ; preds = %LOOP ; Uses a copy intsead of an or ; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]] ; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]] -define void @multi_if_break_loop(i32 %arg) #0 { +define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 { bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %tmp = sub i32 %id, %arg diff --git a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll index 9dd99efd997..97dc67f8260 100644 --- a/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll +++ b/llvm/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll @@ -9,7 +9,7 @@ @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4 ; CHECK-DAG: Name: load_extern_const_init -define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void @@ -19,7 +19,7 @@ define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind { @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4 ; CHECK-DAG: Name: undef_const_addrspace -define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind { %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4 store i32 %val, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll index fd66b0b5d1f..8a7bf6db5b8 100644 --- a/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll +++ b/llvm/test/CodeGen/AMDGPU/no-shrink-extloads.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16: ; SI: s_load_dword s ; SI: buffer_store_short v -define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i16 store i16 %trunc, i16 addrspace(1)* %out ret void @@ -21,7 +21,7 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16: ; SI: buffer_load_dword v ; SI: buffer_store_short v -define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr 
i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -34,7 +34,7 @@ define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspa ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -43,7 +43,7 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -56,7 +56,7 @@ define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind { %trunc = trunc i32 %arg to i1 store i1 %trunc, i1 addrspace(1)* %out ret void @@ -65,7 +65,7 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid @@ -78,7 +78,7 @@ define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { %trunc = trunc i64 %arg to i32 store i32 %trunc, i32 addrspace(1)* %out ret void @@ -87,7 +87,7 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -100,7 +100,7 @@ define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspa ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32: ; SI: s_load_dword s ; SI: buffer_store_dword v -define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 
%arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i32 store i32 %trunc, i32 addrspace(1)* %out @@ -110,7 +110,7 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind { ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -125,7 +125,7 @@ define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1) ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind { %trunc = trunc i16 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -134,7 +134,7 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8: ; SI: buffer_load_ubyte v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -147,7 +147,7 @@ define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { %srl = lshr i64 %arg, 32 %trunc = trunc i64 %srl to i8 store i8 %trunc, i8 addrspace(1)* %out @@ -157,7 +157,7 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8: ; SI: buffer_load_dword v ; SI: buffer_store_byte v -define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -171,7 +171,7 @@ define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8: ; SI: s_load_dword s ; SI: buffer_store_byte v -define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { +define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind { %trunc = trunc i64 %arg to i8 store i8 %trunc, i8 addrspace(1)* %out ret void @@ -180,7 +180,7 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8: ; SI: buffer_load_dword 
v ; SI: buffer_store_byte v -define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid @@ -194,7 +194,7 @@ define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0 ; SI: s_waitcnt lgkmcnt(0) ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff -define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { +define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) { entry: %val = load i32, i32 addrspace(2)* %in %mask = and i32 %val, 65535 @@ -205,7 +205,7 @@ entry: ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32: ; SI: buffer_load_dword v ; SI: buffer_store_dword v -define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %bc = bitcast <2 x i32> %ld to i64 %hi = lshr i64 %bc, 32 diff --git a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll index 0242f6d6145..c974471c657 100644 --- a/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/opencl-image-metadata.ll @@ -6,7 +6,7 @@ ; EG: CF_END ; SI: s_endpgm -define void @kernel(i32 addrspace(1)* %out) { +define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) { entry: store i32 0, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/operand-folding.ll b/llvm/test/CodeGen/AMDGPU/operand-folding.ll index 4e5ea4b86b7..3836a2b7e59 100644 --- a/llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: {{^}}fold_sgpr: ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s -define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { entry: %tmp0 = icmp ne i32 %fold, 0 br i1 %tmp0, label %if, label %endif @@ -20,7 +20,7 @@ endif: ; CHECK-LABEL: {{^}}fold_imm: ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 -define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { entry: %fold = add i32 3, 2 %tmp0 = icmp ne i32 %cmp, 0 @@ -46,7 +46,7 @@ endif: ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, -define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { entry: %tmp0 = add i64 %val, 1 store i64 %tmp0, i64 addrspace(1)* %out @@ -61,7 +61,7 @@ entry: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -define void @vector_inline(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -80,7 +80,7 @@ entry: ; CHECK-LABEL: {{^}}imm_one_use: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} -define void @imm_one_use(i32 
addrspace(1)* %out) { +define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = xor i32 %tmp0, 100 @@ -94,7 +94,7 @@ entry: ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -define void @vector_imm(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -114,7 +114,7 @@ entry: ; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]] ; CHECK: buffer_store_dword v[[LO]] -define void @no_fold_tied_subregister() { +define amdgpu_kernel void @no_fold_tied_subregister() { %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef %tmp2 = extractelement <2 x float> %tmp1, i32 0 %tmp3 = extractelement <2 x float> %tmp1, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll index 127f3da220e..fc6f070b737 100644 --- a/llvm/test/CodeGen/AMDGPU/operand-spacing.ll +++ b/llvm/test/CodeGen/AMDGPU/operand-spacing.ll @@ -11,7 +11,7 @@ ; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]] ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]] ; GCN: buffer_store_dword [[RESULT]], -define void @add_f32(float addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) { %result = fadd float %a, %b store float %result, float addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir index 4584802ad5a..2de6b59e59e 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -3,7 +3,7 @@ --- | target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64" - define void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 { main_body: %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 @@ -23,7 +23,7 @@ ret void } - define void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -34,7 +34,7 @@ ret void } - define void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -46,7 +46,7 @@ } - define void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 { main_body: %id = call i32 @llvm.amdgcn.workitem.id.x() %cc = icmp eq i32 %id, 0 @@ -67,7 +67,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -78,7 +78,7 @@ ret void } - define void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -89,7 +89,7 @@ ret void } - define void 
@optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -100,7 +100,7 @@ ret void } - define void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -111,7 +111,7 @@ ret void } - define void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end @@ -122,7 +122,7 @@ ret void } - define void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { + define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v) #0 { main_body: br i1 undef, label %if, label %end diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index eca6909d4eb..eb082843fb8 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -9,7 +9,7 @@ ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -28,7 +28,7 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -39,7 +39,7 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) ; FUNC-LABEL: {{^}}scalar_or_i32: ; SI: s_or_b32 -define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %or = or i32 %a, %b store i32 %or, i32 addrspace(1)* %out ret void @@ -47,7 +47,7 @@ define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}vector_or_i32: ; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { +define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) { %loada = load i32, i32 addrspace(1)* %a %or = or i32 %loada, %b store i32 %or, i32 addrspace(1)* %out @@ -56,7 +56,7 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) ; FUNC-LABEL: {{^}}scalar_or_literal_i32: ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f -define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { %or = or i32 %a, 99999 store i32 %or, i32 addrspace(1)* %out, align 4 ret void @@ -68,7 +68,7 @@ define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) { ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 
s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void @@ -82,7 +82,7 @@ define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -101,7 +101,7 @@ define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: or_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -111,7 +111,7 @@ define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; SI-NOT: or_b32 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63 ; SI-NOT: or_b32 -define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = or i64 %a, 63 store i64 %or, i64 addrspace(1)* %out %foo = add i64 %b, 63 @@ -125,7 +125,7 @@ define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}} ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]] ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} -define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = or i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -133,7 +133,7 @@ define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}vector_or_literal_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} -define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { %loada = load i32, i32 addrspace(1)* %a, align 4 %or = or i32 %loada, 65535 store i32 %or, i32 addrspace(1)* %out, align 4 @@ -142,7 +142,7 @@ define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, ; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32: ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}} -define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { %loada = load i32, i32 addrspace(1)* %a, align 4 %or = or i32 %loada, 4 store i32 %or, i32 addrspace(1)* %out, align 4 @@ -154,7 +154,7 @@ define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspac ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z ; SI: s_or_b64 -define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 
%a, i64 %b) { %or = or i64 %a, %b store i64 %or, i64 addrspace(1)* %out ret void @@ -163,7 +163,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; FUNC-LABEL: {{^}}vector_or_i64: ; SI: v_or_b32_e32 v{{[0-9]}} ; SI: v_or_b32_e32 v{{[0-9]}} -define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %loadb = load i64, i64 addrspace(1)* %b, align 8 %or = or i64 %loada, %loadb @@ -174,7 +174,7 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; FUNC-LABEL: {{^}}scalar_vector_or_i64: ; SI: v_or_b32_e32 v{{[0-9]}} ; SI: v_or_b32_e32 v{{[0-9]}} -define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { +define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) { %loada = load i64, i64 addrspace(1)* %a %or = or i64 %loada, %b store i64 %or, i64 addrspace(1)* %out @@ -186,7 +186,7 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm -define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, 22470723082367 store i64 %or, i64 addrspace(1)* %out @@ -200,7 +200,7 @@ define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0 ; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}} ; SI: s_endpgm -define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, 8 store i64 %or, i64 addrspace(1)* %out @@ -213,7 +213,7 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}} ; SI: s_endpgm -define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, -8 store i64 %or, i64 addrspace(1)* %out @@ -226,7 +226,7 @@ define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace( ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]] ; SI: buffer_store_dwordx2 ; SI: s_endpgm -define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = or i64 %loada, -200 store i64 %or, i64 addrspace(1)* %out @@ -239,7 +239,7 @@ define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]] ; SI: v_mov_b32_e32 
[[VRESULT:v[0-9]+]], s[[SRESULT]] ; SI: buffer_store_dword [[VRESULT]], -define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, i32 addrspace(1)* %out, align 8 @@ -250,7 +250,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) { ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}} ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { +define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1)* %in0 %b = load float, float addrspace(1)* %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -263,7 +263,7 @@ define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrs ; FUNC-LABEL: {{^}}s_or_i1: ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}] -define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %or = or i1 %cmp0, %cmp1 diff --git a/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll b/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll index 32ad9aba04e..57777e783c5 100644 --- a/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/over-max-lds-size.ll @@ -6,7 +6,7 @@ @huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4 -define void @use_huge_lds() { +define amdgpu_kernel void @use_huge_lds() { entry: %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0 store i32 0, i32 addrspace(3)* %v0 diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll index 10cc54decf0..b8621562713 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -9,7 +9,7 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { %val0 = load volatile i32, i32 addrspace(2)* %in0 %val1 = load volatile i32, i32 addrspace(2)* %in1 %lo.i = trunc i32 %val0 to i16 @@ -28,7 +28,7 @@ define void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { %val1 = load i32, i32 addrspace(2)* %in1 %hi.i = trunc i32 %val1 to i16 %hi = bitcast i16 %hi.i to half @@ -44,7 +44,7 @@ define void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234 ; GFX9: ; use [[PACKED]] -define void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { +define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { %val0 = load i32, i32 addrspace(2)* %in0 %lo.i = trunc i32 %val0 to i16 %lo = bitcast i16 %lo.i to half @@ -64,7 +64,7 @@ define void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) 
#0 { ; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -91,7 +91,7 @@ define void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] ; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] -define void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -118,7 +118,7 @@ define void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) # ; GFX9-FLUSH-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}} ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -140,7 +140,7 @@ define void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -164,7 +164,7 @@ define void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -187,7 +187,7 @@ define void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -209,7 +209,7 @@ define void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext diff --git 
a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll index bedf79995e5..9ffd16754a1 100644 --- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -9,7 +9,7 @@ ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { %val0 = load volatile i32, i32 addrspace(2)* %in0 %val1 = load volatile i32, i32 addrspace(2)* %in1 %lo = trunc i32 %val0 to i16 @@ -26,7 +26,7 @@ define void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { ; GFX9: s_load_dword [[VAL1:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]] ; GFX9: ; use [[PACKED]] -define void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { +define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { %val1 = load i32, i32 addrspace(2)* %in1 %hi = trunc i32 %val1 to i16 %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0 @@ -41,7 +41,7 @@ define void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { ; GFX9: s_load_dword [[VAL0:s[0-9]+]] ; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8 ; GFX9: ; use [[PACKED]] -define void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { +define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { %val0 = load i32, i32 addrspace(2)* %in0 %lo = trunc i32 %val0 to i16 %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 @@ -60,7 +60,7 @@ define void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { ; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -85,7 +85,7 @@ define void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] ; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] -define void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -111,7 +111,7 @@ define void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) # ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -130,7 +130,7 @@ define void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64 ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { +define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) 
#0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext @@ -151,7 +151,7 @@ define void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext @@ -169,7 +169,7 @@ define void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { ; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 7 ; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL0]] ; GFX9: ; use [[PACKED]] -define void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { +define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/packetizer.ll b/llvm/test/CodeGen/AMDGPU/packetizer.ll index 49a7c0df748..1764d64c367 100644 --- a/llvm/test/CodeGen/AMDGPU/packetizer.ll +++ b/llvm/test/CodeGen/AMDGPU/packetizer.ll @@ -7,7 +7,7 @@ ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z ; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W -define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) { entry: %shl = sub i32 32, %e %x = add i32 %x_arg, 1 diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll index ea943a533c8..a90f200f79e 100644 --- a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -11,7 +11,7 @@ ; to do its transfomation, however now that we are using local memory for ; allocas, the transformation isn't happening. 
-define void @_Z9chk1D_512v() #0 { +define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 %b0 = alloca i32, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll index 1da1e91b8ab..91116b0f65e 100644 --- a/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll +++ b/llvm/test/CodeGen/AMDGPU/parallelorifcollapse.ll @@ -12,7 +12,7 @@ ; CHECK: OR_INT ; CHECK-NEXT: OR_INT ; CHECK-NEXT: OR_INT -define void @_Z9chk1D_512v() #0 { +define amdgpu_kernel void @_Z9chk1D_512v() #0 { entry: %a0 = alloca i32, align 4 %b0 = alloca i32, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll index 3e0d36978ad..4bcfe5f3d28 100644 --- a/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll +++ b/llvm/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %in.gep diff --git a/llvm/test/CodeGen/AMDGPU/predicates.ll b/llvm/test/CodeGen/AMDGPU/predicates.ll index c1af815c7b1..566b48eb886 100644 --- a/llvm/test/CodeGen/AMDGPU/predicates.ll +++ b/llvm/test/CodeGen/AMDGPU/predicates.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}simple_if: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) { entry: %cmp0 = icmp sgt i32 %in, 0 br i1 %cmp0, label %IF, label %ENDIF @@ -25,7 +25,7 @@ ENDIF: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF, label %ELSE @@ -51,7 +51,7 @@ ENDIF: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF0, label %ENDIF @@ -79,7 +79,7 @@ ENDIF: ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred, ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel -define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 br i1 %0, label %IF0, label %ENDIF diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll index 2894730eccb..af268351029 100644 --- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -18,7 +18,7 @@ ; OPTNONE-NOT: s_mov_b32 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, 
v{{[0-9]+}}, s[0:3], s7 offen{{$}} -define void @store_to_undef() #0 { +define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32* undef ret void } @@ -28,7 +28,7 @@ define void @store_to_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @store_to_inttoptr() #0 { +define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32* inttoptr (i32 123 to i32*) ret void } @@ -38,7 +38,7 @@ define void @store_to_inttoptr() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @load_from_undef() #0 { +define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32* undef ret void } @@ -48,7 +48,7 @@ define void @load_from_undef() #0 { ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}} ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} -define void @load_from_inttoptr() #0 { +define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32* inttoptr (i32 123 to i32*) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll index 9e75cc2bae4..f8054307970 100644 --- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll +++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll @@ -36,7 +36,7 @@ ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -106,7 +106,7 @@ entry: ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} -define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -143,7 +143,7 @@ entry: ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -179,7 +179,7 @@ entry: ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} ; HSA-ELT4: 
buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 @@ -228,7 +228,7 @@ entry: ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { +define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %idxprom = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index eea10c86223..9fa3051928a 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -4,7 +4,7 @@ ; This works because promote allocas pass replaces these with LDS atomics. ; Private atomics have no real use, but at least shouldn't crash on it. -define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -17,7 +17,7 @@ entry: ret void } -define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll b/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll index 8ba0b70dbdb..9b5f655f1b5 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-broken.ll @@ -7,7 +7,7 @@ declare i32 @foo(i32*) nounwind -define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll index 3e1796959aa..d07a0a02cba 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-r600.ll @@ -16,7 +16,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0 -define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca [5 x i32], align 4 %0 = load i32, i32 addrspace(1)* %in, align 4 @@ -47,7 +47,7 @@ entry: ; R600-NOT: MOVA_INT %struct.point = type { i32, i32 } -define void @multiple_structs(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 { entry: %a = alloca %struct.point %b = alloca %struct.point @@ 
-75,7 +75,7 @@ entry: ; FUNC-LABEL: {{^}}direct_loop: ; R600-NOT: MOVA_INT -define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { entry: %prv_array_const = alloca [2 x i32] %prv_array = alloca [2 x i32] @@ -110,7 +110,7 @@ for.end: ; FUNC-LABEL: {{^}}short_array: ; R600: MOVA_INT -define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i16] %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0 @@ -127,7 +127,7 @@ entry: ; FUNC-LABEL: {{^}}char_array: ; R600: MOVA_INT -define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8] %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0 @@ -148,7 +148,7 @@ entry: ; R600-NOT: MOV T0.X ; Additional check in case the move ends up in the last slot ; R600-NOT: MOV * TO.X -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [2 x i32] %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0 @@ -169,7 +169,7 @@ entry: ; R600_CHECK: MOV ; R600_CHECK: [[CHAN:[XYZW]]]+ ; R600-NOT: [[CHAN]]+ -define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1 %1 = alloca [2 x i8], align 1 @@ -193,7 +193,7 @@ entry: ret void } -define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i8]] %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0 @@ -207,7 +207,7 @@ entry: ret void } -define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -220,7 +220,7 @@ entry: ret void } -define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i64]] %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0 @@ -235,7 +235,7 @@ entry: %struct.pair32 = type { i32, i32 } -define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x %struct.pair32]] %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1 @@ -248,7 +248,7 @@ entry: ret void } -define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x %struct.pair32] %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1 @@ -261,7 +261,7 @@ entry: ret void } -define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind { +define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) 
nounwind { entry: %tmp = alloca [2 x i32] %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0 @@ -282,7 +282,7 @@ entry: ; SI-NOT: ds_write ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; -define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32] %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a store i32 5, i32* %tmp0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll index 3bd0aecf7aa..41a68b18b0a 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @array_alloca( ; CHECK: %stack = alloca i32, i32 5, align 4 -define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { +define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 { entry: %stack = alloca i32, i32 5, align 4 %ld0 = load i32, i32 addrspace(1)* %in, align 4 @@ -27,7 +27,7 @@ entry: ; CHECK-LABEL: @array_alloca_dynamic( ; CHECK: %stack = alloca i32, i32 %size, align 4 -define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { +define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 { entry: %stack = alloca i32, i32 %size, align 4 %ld0 = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll index 82030f377d9..a5eb92de9e4 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll @@ -7,14 +7,14 @@ declare void @foo(float*) #0 declare void @foo.varargs(...) 
#0 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo -define void @crash_call_constexpr_cast() #0 { +define amdgpu_kernel void @crash_call_constexpr_cast() #0 { %alloca = alloca i32 call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0 ret void } ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs -define void @crash_call_constexpr_cast_varargs() #0 { +define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 { %alloca = alloca i32 call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0 ret void diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll index eb0d0cc6269..38db51d4c8c 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-globals.ll @@ -5,12 +5,12 @@ @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4 -; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { ; IR: alloca [10 x i32] ; ASM-LABEL: {{^}}promote_alloca_size_256: ; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only) -define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [10 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll index 4c3c15dac0d..f83eb56dc6e 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll @@ -7,7 +7,7 @@ declare i8* @llvm.invariant.group.barrier(i8*) #1 ; GCN-LABEL: {{^}}use_invariant_promotable_lds: ; GCN: buffer_load_dword ; GCN: ds_write_b32 -define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll index eeda19fa27a..9d6f78f808e 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll @@ -7,7 +7,7 @@ declare void @llvm.lifetime.end(i64, i8* nocapture) #0 ; OPT-NOT: alloca i32 ; OPT-NOT: llvm.lifetime ; OPT: store i32 %tmp3, i32 addrspace(3)* -define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll index ce50f1b1005..7a4a451ff36 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll @@ -14,7 +14,7 @@ declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) #1 ; CHECK: getelementptr inbounds [64 x 
[17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false) ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false) -define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -28,7 +28,7 @@ define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false) ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false) -define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -41,7 +41,7 @@ define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; CHECK-LABEL: @promote_with_memset( ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false) -define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)* @@ -53,7 +53,7 @@ define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; CHECK-LABEL: @promote_with_objectsize( ; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false) -define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 { %alloca = alloca [17 x i32], align 4 %alloca.bc = bitcast [17 x i32]* %alloca to i8* %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll index 8ba849e5f88..9f22f207179 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll @@ -5,7 +5,7 @@ ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}} ; NOOPTS-NOT ds_write ; OPTS: ds_write -define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { +define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 
x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 @@ -21,7 +21,7 @@ entry: ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array: ; ALL: workgroup_group_segment_byte_size = 0{{$}} ; ALL-NOT ds_write -define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { +define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 { entry: %alloca = alloca [2 x [2 x i32]] %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll index 468a789e4a6..bf3bc493a4b 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll @@ -30,7 +30,7 @@ ; GCN-LABEL: {{^}}promote_alloca_size_order_0: ; GCN: workgroup_group_segment_byte_size = 2340 -define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 @@ -62,7 +62,7 @@ entry: ; GCN-LABEL: {{^}}promote_alloca_size_order_1: ; GCN: workgroup_group_segment_byte_size = 2352 -define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 @@ -100,7 +100,7 @@ entry: ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit: ; GCN: workgroup_group_segment_byte_size = 1060 -define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { +define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 { entry: %stack = alloca [5 x i32], align 4 %tmp0 = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll index 3bcbb4f986b..03ce116cfca 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}stored_lds_pointer_value: ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { %tmp = alloca float store float 0.0, float *%tmp store float* %tmp, float* addrspace(1)* %ptr @@ -14,7 +14,7 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 { ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset: ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { %tmp0 = alloca float %tmp1 = alloca float store float 0.0, float *%tmp0 @@ -29,7 +29,7 @@ define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 { ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; GCN: buffer_store_dword 
v ; GCN: buffer_store_dword v -define void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 { +define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 { bb: %tmp = alloca float, i32 16 store float 0.0, float* %tmp @@ -46,7 +46,7 @@ bb: ; GCN: buffer_store_dword ; GCN: buffer_store_dword ; GCN: buffer_store_dword -define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) { entry: %tmp0 = alloca [4 x i32] %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0 @@ -64,7 +64,7 @@ entry: ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-NOT: ds_ -define void @stored_fi_to_self() #0 { +define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32* store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp %bitcast = bitcast i32** %tmp to i32* diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll index 2e7527dbdbc..ebef6122990 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll @@ -8,7 +8,7 @@ ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1 -define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -22,7 +22,7 @@ define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %ou ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null -define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %cmp = icmp eq i32* %ptr0, null @@ -35,7 +35,7 @@ define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}} ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a ; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0 -define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %cmp = icmp eq i32* null, %ptr0 @@ -49,7 +49,7 @@ define void @lds_promoted_alloca_icmp_null_lhs(i32 
addrspace(1)* %out, i32 %a, i ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a ; CHECK: %ptr1 = call i32* @get_unknown_pointer() ; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1 -define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = call i32* @get_unknown_pointer() diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll index 0462a351c39..d196897d67d 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll @@ -13,7 +13,7 @@ ; CHECK: endif: ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ] ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 -define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %else @@ -34,7 +34,7 @@ endif: ; CHECK-LABEL: @branch_ptr_phi_alloca_null_0( ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ] -define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %endif @@ -51,7 +51,7 @@ endif: ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1( ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ null, %entry ], [ %arrayidx0, %if ] -define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %endif @@ -73,7 +73,7 @@ endif: ; CHECK: br label %exit ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ] ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4 -define void @one_phi_value(i32 %a) #0 { +define amdgpu_kernel void @one_phi_value(i32 %a) #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a @@ -97,7 +97,7 @@ exit: ; CHECK: endif: ; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ] ; CHECK: store i32 0, i32* %phi.ptr, align 4 -define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 { entry: %alloca = alloca [64 x i32], align 4 br i1 undef, label %if, label %else @@ -134,7 +134,7 @@ endif: ; CHECK-LABEL: @ptr_induction_var_same_alloca( ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ] -define void @ptr_induction_var_same_alloca() #0 { +define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 @@ -172,7 +172,7 @@ for.body: ; preds = %for.body, %entry ; CHECK: %alloca = alloca [64 x i32], align 4 ; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ] ; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call -define void @ptr_induction_var_alloca_unknown() #0 { +define amdgpu_kernel void 
@ptr_induction_var_alloca_unknown() #0 { entry: %alloca = alloca [64 x i32], align 4 %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll index 34d274df738..55c2229fb6b 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand( ; CHECK: %alloca = alloca i32 ; CHECK: select i1 undef, i32* undef, i32* %alloca -define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { %alloca = alloca i32, align 4 %select = select i1 undef, i32* undef, i32* %alloca store i32 0, i32* %select, align 4 @@ -16,7 +16,7 @@ define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 { ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 -define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b @@ -33,7 +33,7 @@ define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 { ; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a ; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b ; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1 -define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { %alloca0 = alloca i32, i32 16, align 4 %alloca1 = alloca i32, i32 16, align 4 %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a @@ -50,7 +50,7 @@ define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 { ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4 -define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { +define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1 %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3 @@ -67,7 +67,7 @@ define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 { ; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1 ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4 -define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 { %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a %ptr1 = getelementptr inbounds [16 x i32], 
[16 x i32]* %alloca, i32 0, i32 %b @@ -78,7 +78,7 @@ define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 ret void } -define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 { entry: %alloca = alloca [16 x i32], align 4 %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a @@ -102,7 +102,7 @@ bb2: ; CHECK-LABEL: @select_null_rhs( ; CHECK-NOT: alloca ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null -define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { +define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 @@ -117,7 +117,7 @@ bb: ; CHECK-LABEL: @select_null_lhs( ; CHECK-NOT: alloca ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}} -define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { +define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll index e331731f90f..88c0e911662 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll @@ -8,7 +8,7 @@ declare void @llvm.stackrestore(i8*) #2 ; CHECK-LABEL: @try_promote_unhandled_intrinsic( ; CHECK: alloca ; CHECK: call void @llvm.stackrestore(i8* %tmp1) -define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { +define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 { bb: %tmp = alloca i32, align 4 %tmp1 = bitcast i32* %tmp to i8* diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index 626ff1e1e13..9c43a6dc60f 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @volatile_load( ; CHECK: alloca [5 x i32] ; CHECK: load volatile i32, i32* -define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -16,7 +16,7 @@ entry: ; CHECK-LABEL: @volatile_store( ; CHECK: alloca [5 x i32] ; CHECK: store volatile i32 %tmp, i32* -define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { +define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -30,7 +30,7 @@ entry: ; CHECK: alloca double ; CHECK: load double ; CHECK: load volatile double -define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { +define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 { bb: %tmp = alloca double, align 8 store double 0.000000e+00, double* %tmp, align 8 diff --git 
a/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll index 866a4a9191e..b7ed34bbf09 100644 --- a/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll @@ -2,7 +2,7 @@ ; Don't crash ; CHECK: MAX_UINT -define void @test(i64 addrspace(1)* %out) { +define amdgpu_kernel void @test(i64 addrspace(1)* %out) { bb: store i64 2, i64 addrspace(1)* %out %tmp = load i64, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll b/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll index c5b89343616..2604ed4e574 100644 --- a/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll +++ b/llvm/test/CodeGen/AMDGPU/r600.alu-limits.ll @@ -6,7 +6,7 @@ %struct.foo = type {i32, i32, i32} -define void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) { +define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) { entry: %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2 %x = load i32, i32 *%ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll index 49441ee8d18..acf7a66a357 100644 --- a/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/r600.bitcast.ll @@ -8,7 +8,7 @@ ; EG: VTX_READ_128 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) { entry: %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)* %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0 @@ -21,7 +21,7 @@ entry: ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %load to <2 x i16> store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 @@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %bc = bitcast <2 x i16> %load to float store float %bc, float addrspace(1)* %out, align 4 @@ -45,7 +45,7 @@ define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind { %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4 %bc = bitcast <4 x i8> %load to i32 store i32 %bc, i32 addrspace(1)* %out, align 4 @@ -57,7 +57,7 @@ define void @v4i8_to_i32(i32 
addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nou ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %load = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -69,7 +69,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %bc = bitcast <2 x i16> %load to <4 x i8> store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4 @@ -85,7 +85,7 @@ define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* ; EG: VTX_READ_16 ; EG-DAG: BFE_UINT ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind { %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 2 %bc = bitcast <4 x i16> %load to <8 x i8> %element = extractelement <8 x i8> %bc, i32 5 @@ -98,7 +98,7 @@ define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in ; EG: VTX_READ_64 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]] ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal -define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %bc = bitcast <2 x i32> %val to double store double %bc, double addrspace(1)* %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll b/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll index 7047c635dff..1ddc41feb00 100644 --- a/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/r600.global_atomics.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}atomic_add_i32_offset: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -16,7 +16,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_soffset: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst @@ -27,7 +27,7 @@ entry: ; FIXME: looks like the offset is wrong ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ 
*]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595 @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -49,7 +49,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -58,7 +58,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_add_i32_addr64: ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -68,7 +68,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_offset: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst @@ -78,7 +78,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -89,7 +89,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -98,7 +98,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_and_i32_addr64: ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -108,7 +108,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_offset: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void 
@atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst @@ -118,7 +118,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -129,7 +129,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -138,7 +138,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64: ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -148,7 +148,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_offset: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst @@ -158,7 +158,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -169,7 +169,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -178,7 +178,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_max_i32_addr64: ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -188,7 +188,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_offset: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* 
%out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst @@ -198,7 +198,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -209,7 +209,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -218,7 +218,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64: ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -228,7 +228,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_offset: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst @@ -238,7 +238,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -249,7 +249,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -258,7 +258,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_min_i32_addr64: ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -268,7 +268,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_offset: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 
%in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst @@ -278,7 +278,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -289,7 +289,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -298,7 +298,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64: ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -308,7 +308,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_offset: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst @@ -318,7 +318,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -329,7 +329,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -338,7 +338,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_or_i32_addr64: ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -348,7 +348,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 
addrspace(1)* %out, i64 4 %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst @@ -358,7 +358,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -369,7 +369,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -378,7 +378,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -388,7 +388,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst @@ -398,7 +398,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -409,7 +409,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst ret void @@ -418,7 +418,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64: ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst @@ -428,7 +428,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_offset: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void 
@atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst @@ -438,7 +438,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -449,7 +449,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) { entry: %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst ret void @@ -458,7 +458,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64: ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z -define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { +define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst @@ -468,7 +468,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 store atomic i32 %in, i32 addrspace(1)* %gep seq_cst, align 4 @@ -478,7 +478,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) { entry: store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4 ret void @@ -487,7 +487,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4 @@ -498,7 +498,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_store_i32_addr64: ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]] ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y -define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { +define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) { entry: %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4 @@ -507,7 +507,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_inc_add ; EG: MEM_RAT ATOMIC_INC_UINT -define void @atomic_inc_add(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_inc_add(i32 
addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 1 seq_cst @@ -516,7 +516,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_dec_add ; EG: MEM_RAT ATOMIC_DEC_UINT -define void @atomic_dec_add(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_dec_add(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 -1 seq_cst @@ -525,7 +525,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_inc_sub ; EG: MEM_RAT ATOMIC_INC_UINT -define void @atomic_inc_sub(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_inc_sub(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 -1 seq_cst @@ -534,7 +534,7 @@ entry: ; FUNC-LABEL: {{^}}atomic_dec_sub ; EG: MEM_RAT ATOMIC_DEC_UINT -define void @atomic_dec_sub(i32 addrspace(1)* %out) { +define amdgpu_kernel void @atomic_dec_sub(i32 addrspace(1)* %out) { entry: %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4 %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll b/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll index f406c160cbb..53ee214f07e 100644 --- a/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/r600.private-memory.ll @@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; Additional check in case the move ends up in the last slot ; R600-NOT: MOV * TO.X -define void @work_item_info(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) { entry: %0 = alloca [2 x i32] %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll index a34a48e3b7b..9eee9a6effc 100644 --- a/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll @@ -2,7 +2,7 @@ ; FUNC-LABEL: {{^}}tgid_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.X -define void @tgid_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -11,7 +11,7 @@ entry: ; FUNC-LABEL: {{^}}tgid_y: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y -define void @tgid_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ entry: ; FUNC-LABEL: {{^}}tgid_z: ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z -define void @tgid_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tgid.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_x: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X -define void @tidig_x(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.x() #0 store i32 %0, i32 addrspace(1)* %out @@ -38,7 +38,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_y: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y -define void @tidig_y(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.y() #0 store i32 %0, i32 addrspace(1)* %out @@ -47,7 
+47,7 @@ entry: ; FUNC-LABEL: {{^}}tidig_z: ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z -define void @tidig_z(i32 addrspace(1)* %out) { +define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.r600.read.tidig.z() #0 store i32 %0, i32 addrspace(1)* %out @@ -57,7 +57,7 @@ entry: ; FUNC-LABEL: {{^}}test_implicit: ; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56 -define void @test_implicit(i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4 @@ -69,7 +69,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 { ; FUNC-LABEL: {{^}}test_implicit_dyn: ; 36 prepended implicit bytes + 8(out pointer + in) = 44 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44 -define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { +define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 { %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr() %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)* %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll index 0aa11f2540f..fbdaeb82929 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -9,7 +9,7 @@ ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -33,7 +33,7 @@ define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv fast float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -45,7 +45,7 @@ define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv arcp float 1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -57,7 +57,7 @@ define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { +define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { %rcp = fdiv float 
1.0, %src, !fpmath !0 store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -69,7 +69,7 @@ define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %rcp = fdiv float 1.0, %src.fabs store float %rcp, float addrspace(1)* %out, align 4 @@ -82,7 +82,7 @@ define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: buffer_store_dword [[RCP]] ; EG: RECIP_IEEE -define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float -1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void @@ -92,7 +92,7 @@ define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: s_load_dword [[SRC:s[0-9]+]] ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]| ; GCN: buffer_store_dword [[RCP]] -define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg @@ -106,7 +106,7 @@ define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]| ; GCN: buffer_store_dword [[RCP]] ; GCN: buffer_store_dword [[MUL]] -define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { +define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 { %src.fabs = call float @llvm.fabs.f32(float %src) %src.fabs.fneg = fsub float -0.0, %src.fabs %rcp = fdiv float 1.0, %src.fabs.fneg @@ -120,7 +120,7 @@ define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %sr ; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32: ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} ; GCN: buffer_store_dword [[MUL]] -define void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef %rcp = fdiv arcp float %x, 2.0 store float %rcp, float addrspace(1)* %out, align 4 @@ -130,7 +130,7 @@ define void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { ; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32: ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}} ; GCN: buffer_store_dword [[MUL]] -define void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef %rcp = fdiv arcp float %x, 10.0 store float %rcp, float addrspace(1)* %out, align 4 @@ -140,7 +140,7 @@ define void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { ; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32: ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}} ; GCN: buffer_store_dword [[MUL]] -define void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef %rcp = fdiv arcp float %x, -10.0 store float %rcp, float addrspace(1)* %out, align 4 diff --git 
a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll index a5581d73cb2..34cbe396336 100644 --- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll +++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll @@ -4,7 +4,7 @@ declare i32 @llvm.read_register.i32(metadata) #0 -define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll index 2617ad7402f..6417d28e7aa 100644 --- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll @@ -4,7 +4,7 @@ declare i32 @llvm.read_register.i32(metadata) #0 -define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind { +define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll index dcde8a1894f..8e248fdfea4 100644 --- a/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll +++ b/llvm/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll @@ -4,7 +4,7 @@ declare i64 @llvm.read_register.i64(metadata) #0 -define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !0) store i64 %exec, i64 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/read_register.ll b/llvm/test/CodeGen/AMDGPU/read_register.ll index 601a0adb812..8fe9e7f3f11 100644 --- a/llvm/test/CodeGen/AMDGPU/read_register.ll +++ b/llvm/test/CodeGen/AMDGPU/read_register.ll @@ -9,7 +9,7 @@ declare i64 @llvm.read_register.i64(metadata) #0 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]] ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_m0(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 { store volatile i32 0, i32 addrspace(3)* undef %m0 = call i32 @llvm.read_register.i32(metadata !0) store i32 %m0, i32 addrspace(1)* %out @@ -20,7 +20,7 @@ define void @test_read_m0(i32 addrspace(1)* %out) #0 { ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_read_exec(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 { %exec = call i64 @llvm.read_register.i64(metadata !1) store i64 %exec, i64 addrspace(1)* %out ret void @@ -30,7 +30,7 @@ define void @test_read_exec(i64 addrspace(1)* %out) #0 { ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { %flat_scratch = call i64 @llvm.read_register.i64(metadata !2) store 
i64 %flat_scratch, i64 addrspace(1)* %out ret void @@ -39,7 +39,7 @@ define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_flat_scratch_lo: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3) store i32 %flat_scratch_lo, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_flat_scratch_hi: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4) store i32 %flat_scratch_hi, i32 addrspace(1)* %out ret void @@ -57,7 +57,7 @@ define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_exec_lo: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { %exec_lo = call i32 @llvm.read_register.i32(metadata !5) store i32 %exec_lo, i32 addrspace(1)* %out ret void @@ -66,7 +66,7 @@ define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: {{^}}test_read_exec_hi: ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi ; CHECK: buffer_store_dword [[COPY]] -define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 { %exec_hi = call i32 @llvm.read_register.i32(metadata !6) store i32 %exec_hi, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll index 7965b061fe5..5c698c839fa 100644 --- a/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll +++ b/llvm/test/CodeGen/AMDGPU/readcyclecounter.ll @@ -13,7 +13,7 @@ declare i64 @llvm.readcyclecounter() #0 ; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}} ; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}} ; GCN: store_dwordx2 -define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 { %cycle0 = call i64 @llvm.readcyclecounter() store volatile i64 %cycle0, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll index dd67dc488db..ecb513cd80b 100644 --- a/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll @@ -6,7 +6,7 @@ ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]] ; GCN: buffer_store_dwordx2 -define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %and = and i64 %a, 1234567 store i64 %and, i64 addrspace(1)* %out, align 8 @@ -16,7 +16,7 @@ define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 ad ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0: ; GCN: buffer_load_dword 
[[VAL:v[0-9]+]] ; GCN: buffer_store_dword [[VAL]] -define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %vec = bitcast i64 %a to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 0 @@ -27,7 +27,7 @@ define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 a ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4 ; GCN: buffer_store_dword [[VAL]] -define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %a = load i64, i64 addrspace(1)* %in, align 4 %vec = bitcast i64 %a to <2 x i32> %elt0 = extractelement <2 x i32> %vec, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll index 281e49f804c..601aca48e1e 100644 --- a/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll +++ b/llvm/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll @@ -3,7 +3,7 @@ ; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4: ; GCN: s_load_dwordx2 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { +define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { %x.bc = bitcast <2 x i32> %x to <4 x i16> store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, ; GCN: s_load_dwordx4 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { %x.bc = bitcast <4 x i32> %x to <8 x i16> store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4 ret void @@ -22,7 +22,7 @@ define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, ; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4: ; GCN: s_load_dwordx2 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { +define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 { %x.bc = bitcast <2 x i32> %x to <4 x i16> store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4 ret void @@ -32,7 +32,7 @@ define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 ; GCN: s_load_dwordx4 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { +define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 { %x.bc = bitcast <4 x i32> %x to <2 x i64> store <2 x i64> %x.bc, <2 x i64> addrspace(3)* 
%out, align 4 ret void @@ -44,7 +44,7 @@ define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} -define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { +define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 { %x.bc = bitcast <4 x i16> %x to <2 x i32> store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll index 90964485075..9f8667d3599 100644 --- a/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 -define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 { +define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 { bb: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %cmp0 = icmp eq i32 %id.x, 0 diff --git a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir index de52cc0b1aa..ecf94b5772f 100644 --- a/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir +++ b/llvm/test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -8,7 +8,7 @@ # CHECK: DBG_VALUE{{.*}}debug-use %13.sub2 --- | - define void @test(i32 addrspace(1)* %out) { ret void } + define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void } !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) diff --git a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll index bff3a9f5d2b..26a76cf2041 100644 --- a/llvm/test/CodeGen/AMDGPU/register-count-comments.ll +++ b/llvm/test/CodeGen/AMDGPU/register-count-comments.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0 ; SI: ; Kernel info: ; SI: ; NumSgprs: {{[0-9]+}} ; SI: ; NumVgprs: {{[0-9]+}} -define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { +define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind { %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0); %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid @@ -24,7 +24,7 @@ define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 a ; SI-LABEL: {{^}}one_vgpr_used: ; SI: NumVgprs: 1 -define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { +define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind { store i32 %x, i32 addrspace(1)* %out, align 4 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll index 47bdfba9653..5d4955aa1ce 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/rename-disconnected-bug.ll @@ -3,7 +3,7 @@ ; definition on every path (there should at least be IMPLICIT_DEF instructions). 
target triple = "amdgcn--" -define void @func() { +define amdgpu_kernel void @func() { B0: br i1 undef, label %B1, label %B2 diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir index b928bc7086b..fc2e4426ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck %s --- | - define void @test0() { ret void } - define void @test1() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } ... --- # In the test below we have two independent def+use pairs of subregister1 which diff --git a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll index 412202fa5d5..ff4069226a6 100644 --- a/llvm/test/CodeGen/AMDGPU/reorder-stores.ll +++ b/llvm/test/CodeGen/AMDGPU/reorder-stores.ll @@ -7,7 +7,7 @@ ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16 %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16 store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16 @@ -19,7 +19,7 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap ; SI: ds_read2_b64 ; SI: ds_write2_b64 ; SI: s_endpgm -define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16 %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16 store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16 @@ -39,7 +39,7 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace ; SI: buffer_store_dwordx4 ; SI: buffer_store_dwordx4 ; SI: s_endpgm -define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind { %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32 %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32 store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32 @@ -54,7 +54,7 @@ define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* no ; SI-NOT: ds_read ; SI: ds_write_b64 ; SI: s_endpgm -define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { +define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind { %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8 %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8 %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64> diff 
--git a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll index b60c470de97..266490718dd 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.i64.ll @@ -7,7 +7,7 @@ ; BOTH-DAG: s_lshr_b64 ; BOTH: s_or_b64 ; BOTH: s_endpgm -define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: %0 = shl i64 %x, %y %1 = sub i64 64, %y @@ -26,7 +26,7 @@ entry: ; BOTH: v_or_b32 ; BOTH: v_or_b32 ; BOTH: s_endpgm -define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { entry: %x = load i64, i64 addrspace(1)* %xptr, align 8 %y = load i64, i64 addrspace(1)* %yptr, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/rotl.ll b/llvm/test/CodeGen/AMDGPU/rotl.ll index 7d2b5538ca3..c4bc8cdaabf 100644 --- a/llvm/test/CodeGen/AMDGPU/rotl.ll +++ b/llvm/test/CodeGen/AMDGPU/rotl.ll @@ -10,7 +10,7 @@ ; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}} ; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]] ; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]] -define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { entry: %0 = shl i32 %x, %y %1 = sub i32 32, %y @@ -26,7 +26,7 @@ entry: ; SI-DAG: v_alignbit_b32 ; SI-DAG: v_alignbit_b32 ; SI: s_endpgm -define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { entry: %0 = shl <2 x i32> %x, %y %1 = sub <2 x i32> <i32 32, i32 32>, %y @@ -46,7 +46,7 @@ entry: ; SI-DAG: s_sub_i32 ; SI-DAG: v_alignbit_b32 ; SI: s_endpgm -define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { entry: %0 = shl <4 x i32> %x, %y %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y diff --git a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll index 58a1efe0807..9eda479cd25 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.i64.ll @@ -6,7 +6,7 @@ ; BOTH-DAG: s_lshr_b64 ; BOTH-DAG: s_lshl_b64 ; BOTH: s_or_b64 -define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { +define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) { entry: %tmp0 = sub i64 64, %y %tmp1 = shl i64 %x, %tmp0 @@ -24,7 +24,7 @@ entry: ; VI-DAG: v_lshlrev_b64 ; BOTH: v_or_b32 ; BOTH: v_or_b32 -define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { +define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) { entry: %x = load i64, i64 addrspace(1)* %xptr, align 8 %y = load i64, i64 addrspace(1)* %yptr, align 8 @@ -37,7 +37,7 @@ entry: } ; BOTH-LABEL: {{^}}s_rotr_v2i64: -define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { +define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) { entry: %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y %tmp1 = shl <2 x i64> %x, %tmp0 @@ -48,7 +48,7 @@ entry: } ; BOTH-LABEL: {{^}}v_rotr_v2i64: -define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { +define 
amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) { entry: %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8 %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 55d180077cc..b4e2c2b67ce 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -6,7 +6,7 @@ ; R600: BIT_ALIGN_INT ; SI: v_alignbit_b32 -define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { +define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) { entry: %tmp0 = sub i32 32, %y %tmp1 = shl i32 %x, %tmp0 @@ -22,7 +22,7 @@ entry: ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 -define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) { entry: %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y %tmp1 = shl <2 x i32> %x, %tmp0 @@ -42,7 +42,7 @@ entry: ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 ; SI: v_alignbit_b32 -define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) { entry: %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y %tmp1 = shl <4 x i32> %x, %tmp0 diff --git a/llvm/test/CodeGen/AMDGPU/rsq.ll b/llvm/test/CodeGen/AMDGPU/rsq.ll index 699440c3efb..9462683efe0 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.ll @@ -8,7 +8,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone ; SI-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 ; SI: s_endpgm -define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt @@ -20,7 +20,7 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali ; SI-UNSAFE: v_rsq_f64_e32 ; SI-SAFE: v_sqrt_f64_e32 ; SI: s_endpgm -define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone %div = fdiv double 1.0, %sqrt @@ -31,7 +31,7 @@ define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noa ; SI-LABEL: {{^}}rsq_f32_sgpr: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} ; SI: s_endpgm -define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { +define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt store float %div, float addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind ; SI-SAFE-NOT: v_rsq_f32 ; SI: s_endpgm -define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float 
addrspace(1)* %out, i32 %tid %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -81,7 +81,7 @@ define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { ; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) %div = fdiv float -1.0, %sqrt @@ -96,7 +96,7 @@ define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* n ; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) %div = fdiv double -1.0, %sqrt @@ -112,7 +112,7 @@ define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* ; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { %val = load float, float addrspace(1)* %in, align 4 %val.fneg = fsub float -0.0, %val %sqrt = call float @llvm.sqrt.f32(float %val.fneg) @@ -128,7 +128,7 @@ define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1 ; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { %val = load double, double addrspace(1)* %in, align 4 %val.fneg = fsub double -0.0, %val %sqrt = call double @llvm.sqrt.f64(double %val.fneg) diff --git a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll index acceb3272fc..deef24cea37 100644 --- a/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_addk_i32.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 65 store i32 %add, i32 addrspace(1)* %out ret void @@ -19,7 +19,7 @@ define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] ; SI: s_endpgm -define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k0_x2(i32 
addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { %add0 = add i32 %a, 65 %add1 = add i32 %b, 65 store i32 %add0, i32 addrspace(1)* %out0 @@ -30,7 +30,7 @@ define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, ; SI-LABEL: {{^}}s_addk_i32_k1: ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 32767 ; (1 << 15) - 1 store i32 %add, i32 addrspace(1)* %out ret void @@ -39,7 +39,7 @@ define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}s_addk_i32_k2: ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17 ; SI: s_endpgm -define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, -17 store i32 %add, i32 addrspace(1)* %out ret void @@ -48,7 +48,7 @@ define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}s_addk_i32_k3: ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}} ; SI: s_endpgm -define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, -65 store i32 %add, i32 addrspace(1)* %out ret void @@ -58,7 +58,7 @@ define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 ; SI: s_endpgm -define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { +define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { %add = add <2 x i32> %b, <i32 65, i32 66> store <2 x i32> %add, <2 x i32> addrspace(1)* %out ret void @@ -70,7 +70,7 @@ define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 ; SI: s_endpgm -define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { +define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68> store <4 x i32> %add, <4 x i32> addrspace(1)* %out ret void @@ -86,7 +86,7 @@ define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 ; SI: s_endpgm -define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { +define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72> store <8 x i32> %add, <8 x i32> addrspace(1)* %out ret void @@ -95,7 +95,7 @@ define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { ; SI-LABEL: {{^}}no_s_addk_i32_k0: ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} ; SI: s_endpgm -define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %add = add i32 %b, 32768 ; 1 << 15 store i32 %add, i32 addrspace(1)* %out ret void @@ -105,7 +105,7 @@ define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}commute_s_addk_i32: ; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 { +define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %add = add i32 %size, %b call 
void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add) diff --git a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll index 0164c45083a..a131aaa3dfb 100644 --- a/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_movk_i32.ll @@ -7,7 +7,7 @@ ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -21,7 +21,7 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -35,7 +35,7 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32) store i64 %or, i64 addrspace(1)* %out @@ -49,7 +49,7 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -63,7 +63,7 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32) store i64 %or, i64 addrspace(1)* %out @@ -78,7 +78,7 @@ define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* 
%out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff store i64 %or, i64 addrspace(1)* %out @@ -92,7 +92,7 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 270582939713 ; 65 | (63 << 32) store i64 %or, i64 addrspace(1)* %out @@ -107,7 +107,7 @@ define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32) store i64 %or, i64 addrspace(1)* %out @@ -122,7 +122,7 @@ define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000 store i64 %or, i64 addrspace(1)* %out @@ -137,7 +137,7 @@ define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001 store i64 %or, i64 addrspace(1)* %out @@ -152,7 +152,7 @@ define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888 store i64 %or, i64 addrspace(1)* %out @@ -167,7 +167,7 @@ define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define 
amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff store i64 %or, i64 addrspace(1)* %out @@ -182,7 +182,7 @@ define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]] ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]] ; SI: s_endpgm -define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 4 %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll index e83b368cc1c..f6ed5408ba4 100644 --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -7,7 +7,7 @@ ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] ; SI: buffer_store_dword [[VRESULT]] ; SI: s_endpgm -define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 65 store i32 %mul, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}s_mulk_i32_k1: ; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}} ; SI: s_endpgm -define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 32767 ; (1 << 15) - 1 store i32 %mul, i32 addrspace(1)* %out ret void @@ -25,7 +25,7 @@ define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}s_mulk_i32_k2: ; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} ; SI: s_endpgm -define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, -17 store i32 %mul, i32 addrspace(1)* %out ret void @@ -34,7 +34,7 @@ define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}no_s_mulk_i32_k0: ; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} ; SI: s_endpgm -define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { +define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { %mul = mul i32 %b, 32769 ; 1 << 15 + 1 store i32 %mul, i32 addrspace(1)* %out ret void @@ -44,7 +44,7 @@ define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { ; SI-LABEL: {{^}}commute_s_mulk_i32: ; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 { +define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 { %size = call i32 @llvm.amdgcn.groupstaticsize() %add = mul i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add) diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll index 53448340163..f7a1c65881d 100644 --- a/llvm/test/CodeGen/AMDGPU/sad.ll +++ b/llvm/test/CodeGen/AMDGPU/sad.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}v_sad_u32_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 
= icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -18,7 +18,7 @@ define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { ; GCN-LABEL: {{^}}v_sad_u32_constant_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20 -define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { %icmp0 = icmp ugt i32 %a, 90 %t0 = select i1 %icmp0, i32 %a, i32 90 @@ -34,7 +34,7 @@ define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) { ; GCN-LABEL: {{^}}v_sad_u32_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -51,7 +51,7 @@ define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -68,7 +68,7 @@ define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -84,7 +84,7 @@ define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b store volatile i32 %t0, i32 *undef @@ -101,7 +101,7 @@ define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -119,7 +119,7 @@ define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b store volatile i32 %sub0, i32 *undef @@ -136,7 +136,7 @@ define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b ; GCN: 
s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %b %sub1 = sub i32 %b, %a @@ -154,7 +154,7 @@ define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b @@ -173,7 +173,7 @@ define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { +define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) { %icmp0 = icmp ugt <4 x i32> %a, %b %sub0 = sub <4 x i32> %a, %b %sub1 = sub <4 x i32> %b, %a @@ -187,7 +187,7 @@ define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < ; GCN-LABEL: {{^}}v_sad_u32_i16_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { +define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) { %icmp0 = icmp ugt i16 %a, %b %t0 = select i1 %icmp0, i16 %a, i16 %b @@ -204,7 +204,7 @@ define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { +define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) { %icmp0 = icmp ugt i16 %a, %b %sub0 = sub i16 %a, %b %sub1 = sub i16 %b, %a @@ -218,7 +218,7 @@ define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zero ; GCN-LABEL: {{^}}v_sad_u32_i8_pat1: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { +define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { %icmp0 = icmp ugt i8 %a, %b %t0 = select i1 %icmp0, i8 %a, i8 %b @@ -234,7 +234,7 @@ define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) { ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2: ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { +define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) { %icmp0 = icmp ugt i8 %a, %b 
%sub0 = sub i8 %a, %b %sub1 = sub i8 %b, %a @@ -251,7 +251,7 @@ define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %t0 = select i1 %icmp0, i32 %a, i32 %b @@ -269,7 +269,7 @@ define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}} -define void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { %icmp0 = icmp ugt i32 %a, %b %sub0 = sub i32 %a, %d %sub1 = sub i32 %b, %a diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index f8ced7942a6..586a455b2b9 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone ; FUNC-LABEL: {{^}}saddo_i64_zext: -define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = extractvalue { i64, i1 } %sadd, 1 @@ -17,7 +17,7 @@ define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { } ; FUNC-LABEL: {{^}}s_saddo_i32: -define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %sadd, 0 %carry = extractvalue { i32, i1 } %sadd, 1 @@ -27,7 +27,7 @@ define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}v_saddo_i32: -define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -39,7 +39,7 @@ define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}s_saddo_i64: -define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 %carry = 
extractvalue { i64, i1 } %sadd, 1 @@ -51,7 +51,7 @@ define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; FUNC-LABEL: {{^}}v_saddo_i64: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll index 14369a8953f..6e1dd163833 100644 --- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll +++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll @@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}} -define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { +define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = call i32 @llvm.amdgcn.workitem.id.y() @@ -65,7 +65,7 @@ done: ; preds = %loop ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]] ; GCN-NOHSA: buffer_store_dword [[V_OUT]] ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]] -define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { +define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 { entry: %tmp = icmp ne i32 %a, 0 br i1 %tmp, label %if, label %else @@ -93,7 +93,7 @@ endif: ; preds = %else, %if ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -113,7 +113,7 @@ entry: ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} -define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp @@ -133,7 +133,7 @@ entry: ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx2 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp @@ -155,7 +155,7 @@ entry: ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu_ci_offset_x4(<4 x i32> 
addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp @@ -189,7 +189,7 @@ entry: ; GCN-NOHSA: buffer_store_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp @@ -230,7 +230,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN: s_endpgm -define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { +define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp @@ -247,7 +247,7 @@ entry: ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]] ; GCN-NOHSA: buffer_store_dword [[ADD]] ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]] -define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { +define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -261,7 +261,7 @@ entry: ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset: ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}} ; GCN-HSA flat_load_dword v{{[0-9]}}, v{{[0-9]+:[0-9]+}} -define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -275,7 +275,7 @@ entry: ; GCN-NOHSA-NOT: v_add ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}} ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}] -define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { +define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp, 4 @@ -290,7 +290,7 @@ entry: ; GCN-NOHSA: buffer_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -313,7 +313,7 @@ entry: ; GCN-NOHSA: buffer_store_dword ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 
addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -350,7 +350,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -385,7 +385,7 @@ entry: ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 ; GCN-HSA: flat_load_dwordx4 -define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { +define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 { entry: %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0 @@ -441,7 +441,7 @@ entry: ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] ; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm -define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 %tmp0 = bitcast i32 %cond to float %tmp1 = fadd float %tmp0, 2.500000e-01 @@ -459,7 +459,7 @@ bb7: ; preds = %bb3 ; GCN-LABEL: {{^}}phi_visit_order: ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}} -define void @phi_visit_order() { +define amdgpu_kernel void @phi_visit_order() { bb: br label %bb1 @@ -484,7 +484,7 @@ bb4: ; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]: ; GCN: s_xor_b32 [[B]], [[B]], [[A]] ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]] -define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { +define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) { entry: br label %loop diff --git a/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll b/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll index c41d345369b..4ea503bf609 100644 --- a/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll +++ b/llvm/test/CodeGen/AMDGPU/sampler-resource-id.ll @@ -5,7 +5,7 @@ ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 0( -define void @test_0(i32 %in0, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_0(i32 %in0, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0 store i32 %0, i32 addrspace(1)* %out @@ -17,7 +17,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 1( -define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0 store i32 %0, i32 addrspace(1)* %out @@ -29,7 +29,7 @@ entry: ; EG: MOV [[VAL]], literal.x ; EG-NEXT: LSHR ; EG-NEXT: 2( -define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { +define amdgpu_kernel void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) { entry: %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0 store i32 %0, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir b/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir index af71086e542..f4ef1c83fba 100644 --- 
a/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir +++ b/llvm/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir @@ -1,23 +1,23 @@ # RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s --- | - define void @basic_insert_dcache_wb() { + define amdgpu_kernel void @basic_insert_dcache_wb() { ret void } - define void @explicit_flush_after() { + define amdgpu_kernel void @explicit_flush_after() { ret void } - define void @explicit_flush_before() { + define amdgpu_kernel void @explicit_flush_before() { ret void } - define void @no_scalar_store() { + define amdgpu_kernel void @no_scalar_store() { ret void } - define void @multi_block_store() { + define amdgpu_kernel void @multi_block_store() { bb0: br i1 undef, label %bb1, label %bb2 @@ -28,7 +28,7 @@ ret void } - define void @one_block_store() { + define amdgpu_kernel void @one_block_store() { bb0: br i1 undef, label %bb1, label %bb2 diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index 35886f85618..62d0d936788 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -9,7 +9,7 @@ ; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]] ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]] ; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}} -define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %tmp1 = load i32, i32 addrspace(1)* %in, align 4 %bc = bitcast i32 %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -21,7 +21,7 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace( ; GCN: buffer_load_dword [[VAL:v[0-9]+]], ; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]] ; GCN: buffer_store_dwordx2 -define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { +define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %tmp1 = load float, float addrspace(1)* %in, align 4 %bc = bitcast float %tmp1 to <2 x i16> %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> @@ -33,7 +33,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; to produce one, but for some reason never made it to selection. 
-; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { ; %tmp1 = load i32, i32 addrspace(1)* %in, align 4 ; %bc = bitcast i32 %tmp1 to <4 x i8> @@ -42,7 +42,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0 ; %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1 ; %bc = bitcast <2 x i64> %newvec1 to <4 x i32> @@ -51,7 +51,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0 ; %bc = bitcast <4 x i32> %newvec0 to <8 x i16> ; %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4> @@ -59,7 +59,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { +; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind { ; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 ; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> ; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4> @@ -67,7 +67,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -define void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { +define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 %bc = bitcast <4 x i8> %newvec0 to <2 x half> store <2 x half> %bc, <2 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll index 5cf2c6c7675..177957c0b35 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s ; REQUIRES: asserts -define void @main() #0 { +define amdgpu_kernel void @main() #0 { main_body: %tmp = load <4 x float>, <4 x float> addrspace(9)* null %tmp5 = extractelement <4 x float> %tmp, i32 3 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll index 32c456bd2ce..44d46086f02 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-global-loads.ll @@ -10,7 +10,7 @@ ; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 ; SI: buffer_store_dword [[REG0]] ; SI: buffer_store_dword [[REG1]] -define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 { %load0 = load i32, i32 addrspace(1)* %ptr, align 4 %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2 
%load1 = load i32, i32 addrspace(1)* %gep, align 4 @@ -24,7 +24,7 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* ; FUNC-LABEL: {{^}}same_base_ptr_crash: ; SI: buffer_load_dword ; SI: buffer_load_dword -define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { +define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) { entry: %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset %tmp0 = load i32, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll index aa67b2e0f7d..964298a5531 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-if-2.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2) %1 = extractelement <4 x float> %0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-if.ll b/llvm/test/CodeGen/AMDGPU/schedule-if.ll index 6637b389771..feac5d918f6 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-if.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-if.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main() { +define amdgpu_kernel void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) %1 = extractelement <4 x float> %0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll index ccfde7b9adc..5c47c163dcc 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll @@ -12,7 +12,7 @@ ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38 -define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { +define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind { store i32 %x, i32 addrspace(1)* %out0, align 4 store i32 %y, i32 addrspace(1)* %out1, align 4 ret void @@ -26,7 +26,7 @@ define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, ; GCN: s_load_dwordx2 ; GCN: s_load_dwordx2 ; GCN: s_endpgm -define void @same_base_ptr_crash(i64 addrspace(1)* %out, +define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll index ab81a691de7..4520fe86136 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll @@ -5,7 +5,7 @@ ; We expect a two digit VGPR usage here, not a three digit. 
; CHECK: NumVgprs: {{[0-9][0-9]$}} -define void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) { +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1 %tmp2 = load float, float addrspace(3)* %tmp, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll index 2e659fe5a85..0d19c1e6a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll +++ b/llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll @@ -10,7 +10,7 @@ ; VI: NumSgprs: {{[1-5][0-9]$}} ; VI: NumVgprs: {{[1-3][0-9]$}} -define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { +define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) { bb: %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004 %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252 diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll index b4672904302..6b1e85915a1 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -13,7 +13,7 @@ ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { +define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: %scratch0 = alloca [8192 x i32] %scratch1 = alloca [8192 x i32] @@ -53,7 +53,7 @@ done: ; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} -define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { +define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: %scratch0 = alloca [8192 x i32] %scratch1 = alloca [8192 x i32] @@ -88,7 +88,7 @@ done: ; GCN-LABEL: {{^}}neg_vaddr_offset: ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}} -define void @neg_vaddr_offset(i32 %offset) { +define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32] %ptr_offset = add i32 %offset, 4 @@ -99,7 +99,7 @@ entry: ; GCN-LABEL: {{^}}pos_vaddr_offset: ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 -define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { +define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32] %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index bafd6a50ccf..f9ac425be79 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -13,7 +13,7 @@ ; FUNC-LABEL: {{^}}sdiv_i32: ; EG: CF_END -define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* 
%out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in %den = load i32, i32 addrspace(1) * %den_ptr @@ -23,7 +23,7 @@ define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { } ; FUNC-LABEL: {{^}}sdiv_i32_4: -define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = sdiv i32 %num, 4 store i32 %result, i32 addrspace(1)* %out @@ -43,14 +43,14 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_add_i32 ; SI: buffer_store_dword ; SI: s_endpgm -define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = sdiv i32 %num, 3435 store i32 %result, i32 addrspace(1)* %out ret void } -define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr @@ -59,14 +59,14 @@ define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ret void } -define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %result = sdiv <2 x i32> %num, <i32 4, i32 4> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void } -define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr @@ -75,7 +75,7 @@ define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ret void } -define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> store <4 x i32> %result, <4 x i32> addrspace(1)* %out @@ -86,7 +86,7 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -100,7 +100,7 @@ define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { %den_ptr = getelementptr 
i23, i23 addrspace(1)* %in, i23 1 %num = load i23, i23 addrspace(1) * %in %den = load i23, i23 addrspace(1) * %den_ptr @@ -114,7 +114,7 @@ define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24 ; SI: buffer_store_dword [[BFE]] -define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 %num = load i24, i24 addrspace(1) * %in %den = load i24, i24 addrspace(1) * %den_ptr @@ -126,7 +126,7 @@ define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}v_sdiv_i25: ; SI-NOT: v_rcp_f32 -define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { +define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1 %num = load i25, i25 addrspace(1) * %in %den = load i25, i25 addrspace(1) * %den_ptr @@ -137,19 +137,19 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { } ; Tests for 64-bit divide bypass. -; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %result = sdiv i64 %a, %b ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void ; } -; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %result = srem i64 %a, %b ; store i64 %result, i64 addrspace(1)* %out, align 8 ; ret void ; } -; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { ; %resultdiv = sdiv i64 %a, %b ; %resultrem = srem i64 %a, %b ; %result = add i64 %resultdiv, %resultrem @@ -163,7 +163,7 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) { ; SI: v_mul_hi_i32 ; SI: v_mul_hi_i32 -define void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { +define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16 %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668> store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll index 349a7821da1..257e6be96b6 100644 --- a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll @@ -12,7 +12,7 @@ ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -31,7 +31,7 @@ define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* 
%in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -50,7 +50,7 @@ define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -69,7 +69,7 @@ define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -88,7 +88,7 @@ define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -107,7 +107,7 @@ define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -130,7 +130,7 @@ define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -149,7 +149,7 @@ define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -168,7 +168,7 @@ define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; EG-DAG: INT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_INT -define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -187,7 +187,7 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* 
%in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -206,7 +206,7 @@ define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -225,7 +225,7 @@ define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -244,7 +244,7 @@ define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -263,7 +263,7 @@ define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -283,7 +283,7 @@ define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -303,7 +303,7 @@ define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -323,7 +323,7 @@ define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: INT_TO_FLT ; EG: RECIP_IEEE -define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem25_i17_i12_i32(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem64.ll b/llvm/test/CodeGen/AMDGPU/sdivrem64.ll index 28fdb69e1ad..5ad0d8efaed 100644 --- a/llvm/test/CodeGen/AMDGPU/sdivrem64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdivrem64.ll @@ -70,7 +70,7 @@ ; SI-NOT: v_lshr_b64 ; VI-NOT: v_lshrrev_b64 ; GCN: s_endpgm -define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = sdiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -144,7 +144,7 @@ define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -159,7 +159,7 @@ define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = sdiv i64 %1, %2 @@ -176,7 +176,7 @@ define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = srem i64 %1, %2 @@ -196,7 +196,7 @@ define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -216,7 +216,7 @@ define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 diff --git a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll index d9c5b4c818a..38bbd6d8c95 100644 --- a/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ b/llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -8,7 +8,7 @@ ; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %a = load i32, i32 addrspace(1)* %in, align 4 %shr = lshr i32 %a, 16 %add = add i32 %a, %shr @@ -23,7 +23,7 @@ define void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void 
@sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %a = load i32, i32 addrspace(1)* %in, align 4 %shr = lshr i32 %a, 16 %sub = sub i32 %shr, %a @@ -39,7 +39,7 @@ define void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) { +define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) { %a = load i32, i32 addrspace(1)* %in1, align 4 %b = load i32, i32 addrspace(1)* %in2, align 4 %shra = lshr i32 %a, 16 @@ -55,7 +55,7 @@ define void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 add ; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_u32_u24_sdwa -define void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) { +define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) { entry: %a = load i16, i16 addrspace(1)* %ina, align 4 %b = load i16, i16 addrspace(1)* %inb, align 4 @@ -75,7 +75,7 @@ entry: ; SDWA: v_mul_u32_u24_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL]], v{{[0-9]+}} -define void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -97,7 +97,7 @@ entry: ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL1]], v{{[0-9]+}} ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL0]], v{{[0-9]+}} -define void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { entry: %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4 %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4 @@ -123,7 +123,7 @@ entry: ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL3]], v{{[0-9]+}} ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL2]], v{{[0-9]+}} -define void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { entry: %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4 %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4 @@ -138,7 +138,7 @@ entry: ; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_f16_sdwa -define void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) { +define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) { entry: %a = load half, half addrspace(1)* %ina, align 4 %b = load half, half addrspace(1)* %inb, align 4 @@ -157,7 +157,7 @@ entry: ; SDWA: v_mul_f16_sdwa v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -define void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* 
%inb) { +define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 @@ -178,7 +178,7 @@ entry: ; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { entry: %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4 %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4 @@ -204,7 +204,7 @@ entry: ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { entry: %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4 %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4 @@ -219,7 +219,7 @@ entry: ; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_u32_u24_sdwa -define void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) { +define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) { entry: %a = load i8, i8 addrspace(1)* %ina, align 4 %b = load i8, i8 addrspace(1)* %inb, align 4 @@ -238,7 +238,7 @@ entry: ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 -define void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { entry: %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4 %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4 @@ -259,7 +259,7 @@ entry: ; SDWA: v_mul_u32_u24_sdwa ; SDWA: v_mul_u32_u24_sdwa -define void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { entry: %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4 %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4 @@ -283,7 +283,7 @@ entry: ; SDWA: v_mul_u32_u24_sdwa ; SDWA: v_mul_u32_u24_sdwa -define void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { entry: %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4 %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4 @@ -304,7 +304,7 @@ entry: ; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]] -define void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* 
%inb) { +define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 @@ -318,7 +318,7 @@ entry: ; NOSDWA-NOT: v_mul_u32_u24_sdwa ; SDWA-NOT: v_mul_u32_u24_sdwa -define void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %mul = mul <2 x i16> %a, <i16 123, i16 321> @@ -337,7 +337,7 @@ entry: ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -353,7 +353,7 @@ entry: ; SDWA-NOT: v_mul_u32_u24_sdwa ; SDWA-NOT: v_add_i32_sdwa -define void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) { +define amdgpu_kernel void @mul_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb, i1 addrspace(1)* %incond) { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll index 559d464f36a..c8839c17365 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { +define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -29,7 +29,7 @@ define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { +define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll index 246e2bb045c..3417eb02b36 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll @@ -8,7 +8,7 @@ ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] -define void @add_select_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 
%c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -30,7 +30,7 @@ define void @add_select_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]] -define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -57,7 +57,7 @@ define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[X_ABS]] -define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -80,7 +80,7 @@ define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]] ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]] -define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -104,7 +104,7 @@ define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 { ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_var_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -123,7 +123,7 @@ define void @add_select_fabs_var_f32(i32 %c) #0 { ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_fabs_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -140,7 +140,7 @@ define void @add_select_fabs_negk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]] -define void @add_select_fabs_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -155,7 +155,7 @@ define void @add_select_fabs_negk_negk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_posk_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 { %x = load volatile float, float 
addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float 2.0, float 1.0 @@ -172,7 +172,7 @@ define void @add_select_posk_posk_f32(i32 %c) #0 { ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_negk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -192,7 +192,7 @@ define void @add_select_negk_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]] -define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -209,7 +209,7 @@ define void @add_select_negliteralk_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] -define void @add_select_fabs_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef @@ -228,7 +228,7 @@ define void @add_select_fabs_posk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]] -define void @add_select_posk_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -246,7 +246,7 @@ define void @add_select_posk_fabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -268,7 +268,7 @@ define void @add_select_fneg_fneg_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]] -define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -295,7 +295,7 @@ define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: buffer_store_dword [[ADD]] ; GCN: buffer_store_dword [[NEG_X]] -define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -318,7 +318,7 @@ define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: 
v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]] -define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -342,7 +342,7 @@ define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 { ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fneg_var_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -360,7 +360,7 @@ define void @add_select_fneg_var_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -378,7 +378,7 @@ define void @add_select_fneg_negk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_inv2pi_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -398,7 +398,7 @@ define void @add_select_fneg_inv2pi_f32(i32 %c) #0 { ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -415,7 +415,7 @@ define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 { ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -432,7 +432,7 @@ define void @add_select_negk_negk_f32(i32 %c) #0 { ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]] -define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2048.0, float -4096.0 @@ -446,7 +446,7 @@ define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]] -define void @add_select_fneg_negk_negk_f32(i32 %c) #0 { +define amdgpu_kernel 
void @add_select_fneg_negk_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, float -2.0, float -1.0 @@ -463,7 +463,7 @@ define void @add_select_fneg_negk_negk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_negk_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -480,7 +480,7 @@ define void @add_select_negk_fneg_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_fneg_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -498,7 +498,7 @@ define void @add_select_fneg_posk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]] -define void @add_select_posk_fneg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -518,7 +518,7 @@ define void @add_select_posk_fneg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_negfabs_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -541,7 +541,7 @@ define void @add_select_negfabs_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -564,7 +564,7 @@ define void @add_select_fabs_negfabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_neg_fabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -586,7 +586,7 @@ define void @add_select_neg_fabs_f32(i32 %c) #0 { ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]] -define void @add_select_fabs_neg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 
{ %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -607,7 +607,7 @@ define void @add_select_fabs_neg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_neg_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -629,7 +629,7 @@ define void @add_select_neg_negfabs_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]] -define void @add_select_negfabs_neg_f32(i32 %c) #0 { +define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef @@ -651,7 +651,7 @@ define void @add_select_negfabs_neg_f32(i32 %c) #0 { ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] -define void @mul_select_negfabs_posk_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -672,7 +672,7 @@ define void @mul_select_negfabs_posk_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]] -define void @mul_select_posk_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -690,7 +690,7 @@ define void @mul_select_posk_negfabs_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] -define void @mul_select_negfabs_negk_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -709,7 +709,7 @@ define void @mul_select_negfabs_negk_f32(i32 %c) #0 { ; GCN: v_cmp_ne_u32_e64 vcc ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]] -define void @mul_select_negk_negfabs_f32(i32 %c) #0 { +define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -732,7 +732,7 @@ define void @mul_select_negk_negfabs_f32(i32 %c) #0 { ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_add_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef 
%cmp = icmp eq i32 %c, 0 @@ -749,7 +749,7 @@ define void @select_fneg_posk_src_add_f32(i32 %c) #0 { ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_sub_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %add = fsub float %x, 4.0 @@ -765,7 +765,7 @@ define void @select_fneg_posk_src_sub_f32(i32 %c) #0 { ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_mul_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 %mul = fmul float %x, 4.0 @@ -782,7 +782,7 @@ define void @select_fneg_posk_src_mul_f32(i32 %c) #0 { ; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]] ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_fma_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -799,7 +799,7 @@ define void @select_fneg_posk_src_fma_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc ; GCN-NEXT: buffer_store_dword [[SELECT]] -define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %z = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 @@ -818,7 +818,7 @@ define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 { ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]] ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]] -define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { +define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 { %x = load volatile float, float addrspace(1)* undef %y = load volatile float, float addrspace(1)* undef %cmp = icmp eq i32 %c, 0 diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll index 07dcb215338..5eaad1f363f 100644 --- a/llvm/test/CodeGen/AMDGPU/select-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll @@ -6,7 +6,7 @@ ; FUNC-LABEL: {{^}}select_i1: ; SI: v_cndmask_b32 ; SI-NOT: v_cndmask_b32 -define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { +define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i1 %a, i1 %b store i1 %sel, i1 addrspace(1)* %out, align 4 @@ -19,7 +19,7 @@ define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind ; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]] ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]] -define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { +define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind { %cmp = icmp slt i1 %cond, false %sel = select i1 %cmp, i1 %a, i1 %b store i1 
%sel, i1 addrspace(1)* %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll index ad358d33c40..d56b952118b 100644 --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -11,7 +11,7 @@ ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -27,7 +27,7 @@ define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -43,7 +43,7 @@ define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -59,7 +59,7 @@ define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -76,7 +76,7 @@ define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] ; GCN: s_endpgm -define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -92,7 +92,7 @@ define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] -define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { 
%fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -108,7 +108,7 @@ define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float % ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -124,7 +124,7 @@ define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}} -define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -138,7 +138,7 @@ define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float % ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}} -define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { +define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 { entry: %cmp0 = fcmp oeq float %c0, 1.0 br i1 %cmp0, label %if0, label %endif diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll index 759abe2f2e9..8710fc8c730 100644 --- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll +++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll @@ -10,7 +10,7 @@ ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { +define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind { %cmp = icmp eq i8 %c, 0 %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4 @@ -22,7 +22,7 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4 @@ -36,7 +36,7 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 -define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %select, <2 x i32> addrspace(1)* %out, 
align 8 @@ -49,7 +49,7 @@ define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 -define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16 @@ -64,7 +64,7 @@ define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: buffer_store_dwordx4 -define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in @@ -82,7 +82,7 @@ bb: ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16 @@ -102,7 +102,7 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> ; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]] ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx2 -define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x float> %a, <2 x float> %b store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16 @@ -120,7 +120,7 @@ define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x ; SI: v_cndmask_b32_e32 ; SI: buffer_store_dwordx4 -define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x float> %a, <4 x float> %b store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16 @@ -135,7 +135,7 @@ define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; SI: buffer_store_dwordx4 -define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { +define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 { bb: %tmp2 = icmp ult i32 %cond, 32 %val = load <4 x float>, <4 x float> addrspace(1)* %in @@ -153,7 +153,7 @@ bb: ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind { %cmp 
= icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x float> %a, <8 x float> %b store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16 @@ -165,7 +165,7 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <2 x double> %a, <2 x double> %b store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16 @@ -181,7 +181,7 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <4 x double> %a, <4 x double> %b store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16 @@ -205,7 +205,7 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 ; SI: v_cndmask_b32_e32 -define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { +define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind { %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, <8 x double> %a, <8 x double> %b store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16 diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index 050398c0323..ea578c554a6 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -17,7 +17,7 @@ ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16( +define amdgpu_kernel void @select_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -48,7 +48,7 @@ entry: ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_a( +define amdgpu_kernel void @select_f16_imm_a( half addrspace(1)* %r, half addrspace(1)* %b, half addrspace(1)* %c, @@ -78,7 +78,7 @@ entry: ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_b( +define amdgpu_kernel void @select_f16_imm_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %c, @@ -109,7 +109,7 @@ entry: ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_c( +define amdgpu_kernel void @select_f16_imm_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -139,7 +139,7 @@ entry: ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @select_f16_imm_d( +define amdgpu_kernel void @select_f16_imm_d( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -168,7 +168,7 @@ 
entry: ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16( +define amdgpu_kernel void @select_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -202,7 +202,7 @@ entry: ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_a( +define amdgpu_kernel void @select_v2f16_imm_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %b, <2 x half> addrspace(1)* %c, @@ -235,7 +235,7 @@ entry: ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_b( +define amdgpu_kernel void @select_v2f16_imm_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %c, @@ -272,7 +272,7 @@ entry: ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_c( +define amdgpu_kernel void @select_v2f16_imm_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -304,7 +304,7 @@ entry: ; SI: v_cvt_f16_f32_e32 ; SI: v_cvt_f16_f32_e32 ; GCN: s_endpgm -define void @select_v2f16_imm_d( +define amdgpu_kernel void @select_v2f16_imm_d( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, diff --git a/llvm/test/CodeGen/AMDGPU/select.ll b/llvm/test/CodeGen/AMDGPU/select.ll index 45f3cd5a7ac..e53c159a2f7 100644 --- a/llvm/test/CodeGen/AMDGPU/select.ll +++ b/llvm/test/CodeGen/AMDGPU/select.ll @@ -14,7 +14,7 @@ ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW -define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, +define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out, <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out, <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out, i32 %cond) { diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll index a68fdecb00a..3b4c925a87a 100644 --- a/llvm/test/CodeGen/AMDGPU/select64.ll +++ b/llvm/test/CodeGen/AMDGPU/select64.ll @@ -7,7 +7,7 @@ ; CHECK-NOT: s_lshr_b64 ; CHECK: v_cndmask ; CHECK: v_cndmask -define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { +define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) { entry: %0 = icmp ugt i32 %cond, 5 %1 = select i1 %0, i64 0, i64 %in @@ -18,7 +18,7 @@ entry: ; CHECK-LABEL: {{^}}select_trunc_i64: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { +define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 0, i64 %in %trunc = trunc i64 %sel to i32 @@ -29,7 +29,7 @@ define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwi ; CHECK-LABEL: {{^}}select_trunc_i64_2: ; CHECK: v_cndmask_b32 ; CHECK-NOT: v_cndmask_b32 -define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind { %cmp = icmp ugt i32 %cond, 5 %sel = select i1 %cmp, i64 %a, i64 %b %trunc = trunc i64 %sel to i32 @@ -40,7 +40,7 @@ define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 % ; CHECK-LABEL: {{^}}v_select_trunc_i64_2: ; CHECK: v_cndmask_b32 ; CHECK-NOT: 
v_cndmask_b32 -define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 @@ -54,7 +54,7 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}} ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}} ; CHECK: s_endpgm -define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %cmp = icmp ugt i32 %cond, 5 %a = load i64, i64 addrspace(1)* %aptr, align 8 %b = load i64, i64 addrspace(1)* %bptr, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll index 94d0ace7569..18616851c9c 100644 --- a/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnd.ll @@ -3,7 +3,7 @@ ;CHECK-NOT: SETE ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x, ;CHECK: 1073741824 -define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) { %1 = load float, float addrspace(1)* %in %2 = fcmp oeq float %1, 0.0 %3 = select i1 %2, float 1.0, float 2.0 diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll index 58a4ee7d62b..1504165d3d2 100644 --- a/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc-cnde-int.ll @@ -3,7 +3,7 @@ ;CHECK-NOT: SETE_INT ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x, ;CHECK-NEXT: 2 -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %1 = load i32, i32 addrspace(1)* %in %2 = icmp eq i32 %1, 0 %3 = select i1 %2, i32 1, i32 2 diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll index e870ee891e6..7af5478600b 100644 --- a/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll @@ -6,7 +6,7 @@ ; CHECK-NEXT: -1 ; Test a selectcc with i32 LHS/RHS and float True/False -define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %0 = load i32, i32 addrspace(1)* %in %1 = icmp sge i32 %0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll index 0f46d4c7ea0..8fef3f8b380 100644 --- a/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -7,7 +7,7 @@ ; EG-NOT: CND ; EG: SET{{[NEQGTL]+}}_DX10 -define void @test_a(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.000000e+00 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -35,7 +35,7 @@ ENDIF: ; EG: SET{{[GTEQN]+}}_DX10 ; EG-NEXT: PRED_ ; EG-NEXT: ALU clause starting -define void @test_b(i32 addrspace(1)* 
%out, float %in) { +define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -59,7 +59,7 @@ ENDIF: ; Test a CND*_INT instruction with float true/false values ; EG-LABEL: {{^}}test_c: ; EG: CND{{[GTE]+}}_INT -define void @test_c(float addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) { entry: %0 = icmp sgt i32 %in, 0 %1 = select i1 %0, float 2.0, float 3.0 @@ -72,7 +72,7 @@ entry: ; SI-NEXT: v_cndmask_b32_e64 ; SI-NOT: cmp ; SI-NOT: cndmask -define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = select i1 %icmp0, i32 -1, i32 0 store i32 %ext, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/selectcc.ll b/llvm/test/CodeGen/AMDGPU/selectcc.ll index 446d4ab344b..7eca2291398 100644 --- a/llvm/test/CodeGen/AMDGPU/selectcc.ll +++ b/llvm/test/CodeGen/AMDGPU/selectcc.ll @@ -11,7 +11,7 @@ ; SI: v_cmp_eq_u64 ; SI: v_cndmask ; SI: v_cndmask -define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { +define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) { entry: %0 = icmp eq i64 %lhs, %rhs %1 = select i1 %0, i64 %true, i64 %false diff --git a/llvm/test/CodeGen/AMDGPU/set-dx10.ll b/llvm/test/CodeGen/AMDGPU/set-dx10.ll index 57365a6e1fc..6867c639493 100644 --- a/llvm/test/CodeGen/AMDGPU/set-dx10.ll +++ b/llvm/test/CodeGen/AMDGPU/set-dx10.ll @@ -8,7 +8,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -22,7 +22,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -34,7 +34,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -48,7 +48,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -60,7 +60,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) 
{ entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -74,7 +74,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -86,7 +86,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -100,7 +100,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -112,7 +112,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -126,7 +126,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 @@ -138,7 +138,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 @@ -152,7 +152,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z, ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { +define amdgpu_kernel void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, i32 -1, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll index 11ea793650c..853afa8772e 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-equivalent.ll @@ -3,7 +3,7 @@ ; EG-LABEL: {{^}}and_setcc_setcc_i32: ; EG: AND_INT ; EG-NEXT: SETE_INT -define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %cmp1 = icmp eq i32 %a, -1 %cmp2 = icmp eq i32 %b, -1 %and = and i1 %cmp1, %cmp2 @@ -20,7 +20,7 @@ define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; EG: SETE_INT 
; EG: AND_INT ; EG: SETE_INT -define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { +define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1> %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1> %and = and <4 x i1> %cmp1, %cmp2 diff --git a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll index d96b03f7330..8d455d84bf9 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-fneg-constant.ll @@ -10,7 +10,7 @@ ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]] ; GCN: buffer_store_dword [[MUL]] -define void @multi_use_fneg_src() #0 { +define amdgpu_kernel void @multi_use_fneg_src() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -33,7 +33,7 @@ define void @multi_use_fneg_src() #0 { ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]] ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]] ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]] -define void @multi_foldable_use_fneg_src() #0 { +define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -59,7 +59,7 @@ define void @multi_foldable_use_fneg_src() #0 { ; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]] ; GCN-NOT: xor ; GCN: buffer_store_dword [[MUL]] -define void @multi_use_fneg() #0 { +define amdgpu_kernel void @multi_use_fneg() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -82,7 +82,7 @@ define void @multi_use_fneg() #0 { ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]] ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]] ; GCN: buffer_store_dword [[MUL1]] -define void @multi_foldable_use_fneg() #0 { +define amdgpu_kernel void @multi_foldable_use_fneg() #0 { %a = load volatile float, float addrspace(1)* undef %b = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef @@ -101,7 +101,7 @@ define void @multi_foldable_use_fneg() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_oeq_posk_f32: ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_oeq_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -114,7 +114,7 @@ define void @test_setcc_fneg_oeq_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32: ; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ogt_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -127,7 +127,7 @@ define void @test_setcc_fneg_ogt_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32: ; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_oge_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load 
volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -140,7 +140,7 @@ define void @test_setcc_fneg_oge_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32: ; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_olt_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -153,7 +153,7 @@ define void @test_setcc_fneg_olt_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32: ; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ole_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -166,7 +166,7 @@ define void @test_setcc_fneg_ole_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32: ; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_one_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -179,7 +179,7 @@ define void @test_setcc_fneg_one_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32: ; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ueq_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -192,7 +192,7 @@ define void @test_setcc_fneg_ueq_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32: ; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ugt_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -205,7 +205,7 @@ define void @test_setcc_fneg_ugt_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32: ; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_uge_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -218,7 +218,7 @@ define void @test_setcc_fneg_uge_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32: ; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ult_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -231,7 +231,7 @@ define void @test_setcc_fneg_ult_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32: ; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_ule_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef @@ -244,7 +244,7 @@ define void 
@test_setcc_fneg_ule_posk_f32() #0 { ; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32: ; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}} -define void @test_setcc_fneg_une_posk_f32() #0 { +define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 { %a = load volatile float, float addrspace(1)* undef %x = load volatile i32, i32 addrspace(1)* undef %y = load volatile i32, i32 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll index 4ab6da08563..caddb6f6821 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc-opt.ll @@ -11,7 +11,7 @@ ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 0 @@ -28,7 +28,7 @@ define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1 -define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 0 @@ -42,7 +42,7 @@ define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, -1 @@ -56,7 +56,7 @@ define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, -1 @@ -70,7 +70,7 @@ define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 0 @@ -84,7 +84,7 @@ define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 0 @@ -98,7 +98,7 @@ define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) 
nounwind ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 1 @@ -111,7 +111,7 @@ define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN: v_cmp_eq_u32_e32 vcc, ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN-NEXT: buffer_store_byte [[RESULT]] -define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 1 @@ -124,7 +124,7 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, -1 @@ -137,7 +137,7 @@ define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[TMP]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, -1 @@ -159,7 +159,7 @@ define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, 255 store i1 %icmp0, i1 addrspace(1)* %out @@ -172,7 +172,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind { ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind { %b = load i8, i8 addrspace(1)* %b.ptr %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 @@ -186,7 +186,7 @@ define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nou ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]] ; GCN-NEXT: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -207,7 +207,7 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void 
@cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = sext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -218,7 +218,7 @@ define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN: s_endpgm -define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { +define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { %b.ext = zext i8 %b to i32 %icmp0 = icmp ne i32 %b.ext, -1 store i1 %icmp0, i1 addrspace(1)* %out @@ -229,7 +229,7 @@ define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind { ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 2 @@ -241,7 +241,7 @@ define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[RESULT]] ; GCN-NEXT: s_endpgm -define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = zext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 2 @@ -256,7 +256,7 @@ define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp eq i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp eq i32 %ext, 1 @@ -267,7 +267,7 @@ define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 1 @@ -278,7 +278,7 @@ define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k: ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}} ; GCN: buffer_store_byte [[K]] -define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %icmp0 = icmp ne i32 %a, %b %ext = sext i1 %icmp0 to i32 %icmp1 = icmp ne i32 %ext, 2 diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll index 10d04bab9f6..add90e9c2f3 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone ; GCN-DAG: v_cmp_eq_u32_e32 ; GCN-DAG: v_cmp_eq_u32_e64 -define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @setcc_v2i32(<2 x 
i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> store <2 x i32> %sext, <2 x i32> addrspace(1)* %out @@ -26,7 +26,7 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> % ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cmp_eq_u32_e64 ; GCN: v_cmp_eq_u32_e64 -define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -43,7 +43,7 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* % ; FUNC-LABEL: {{^}}f32_oeq: ; R600: SETE_DX10 ; GCN: v_cmp_eq_f32 -define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp oeq float %a, %b %1 = sext i1 %0 to i32 @@ -54,7 +54,7 @@ entry: ; FUNC-LABEL: {{^}}f32_ogt: ; R600: SETGT_DX10 ; GCN: v_cmp_gt_f32 -define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ogt float %a, %b %1 = sext i1 %0 to i32 @@ -65,7 +65,7 @@ entry: ; FUNC-LABEL: {{^}}f32_oge: ; R600: SETGE_DX10 ; GCN: v_cmp_ge_f32 -define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp oge float %a, %b %1 = sext i1 %0 to i32 @@ -76,7 +76,7 @@ entry: ; FUNC-LABEL: {{^}}f32_olt: ; R600: SETGT_DX10 ; GCN: v_cmp_lt_f32 -define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp olt float %a, %b %1 = sext i1 %0 to i32 @@ -87,7 +87,7 @@ entry: ; FUNC-LABEL: {{^}}f32_ole: ; R600: SETGE_DX10 ; GCN: v_cmp_le_f32 -define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ole float %a, %b %1 = sext i1 %0 to i32 @@ -105,7 +105,7 @@ entry: ; GCN: v_cmp_lg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp one float %a, %b %1 = sext i1 %0 to i32 @@ -119,7 +119,7 @@ entry: ; R600-DAG: AND_INT ; R600-DAG: SETNE_INT ; GCN: v_cmp_o_f32 -define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ord float %a, %b %1 = sext i1 %0 to i32 @@ -137,7 +137,7 @@ entry: ; GCN: v_cmp_nlg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ueq float %a, %b %1 = sext i1 %0 to i32 @@ -150,7 +150,7 @@ entry: ; R600: SETE_DX10 ; GCN: v_cmp_nle_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 { 
entry: %0 = fcmp ugt float %a, %b %1 = sext i1 %0 to i32 @@ -164,7 +164,7 @@ entry: ; GCN: v_cmp_nlt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp uge float %a, %b %1 = sext i1 %0 to i32 @@ -178,7 +178,7 @@ entry: ; GCN: v_cmp_nge_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ult float %a, %b %1 = sext i1 %0 to i32 @@ -192,7 +192,7 @@ entry: ; GCN: v_cmp_ngt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp ule float %a, %b %1 = sext i1 %0 to i32 @@ -203,7 +203,7 @@ entry: ; FUNC-LABEL: {{^}}f32_une: ; R600: SETNE_DX10 ; GCN: v_cmp_neq_f32 -define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp une float %a, %b %1 = sext i1 %0 to i32 @@ -217,7 +217,7 @@ entry: ; R600: OR_INT ; R600: SETNE_INT ; GCN: v_cmp_u_f32 -define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 { entry: %0 = fcmp uno float %a, %b %1 = sext i1 %0 to i32 @@ -232,7 +232,7 @@ entry: ; FUNC-LABEL: {{^}}i32_eq: ; R600: SETE_INT ; GCN: v_cmp_eq_u32 -define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp eq i32 %a, %b %1 = sext i1 %0 to i32 @@ -243,7 +243,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ne: ; R600: SETNE_INT ; GCN: v_cmp_ne_u32 -define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ne i32 %a, %b %1 = sext i1 %0 to i32 @@ -254,7 +254,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ugt: ; R600: SETGT_UINT ; GCN: v_cmp_gt_u32 -define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ugt i32 %a, %b %1 = sext i1 %0 to i32 @@ -265,7 +265,7 @@ entry: ; FUNC-LABEL: {{^}}i32_uge: ; R600: SETGE_UINT ; GCN: v_cmp_ge_u32 -define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp uge i32 %a, %b %1 = sext i1 %0 to i32 @@ -276,7 +276,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ult: ; R600: SETGT_UINT ; GCN: v_cmp_lt_u32 -define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ult i32 %a, %b %1 = sext i1 %0 to i32 @@ -287,7 +287,7 @@ entry: ; FUNC-LABEL: {{^}}i32_ule: ; R600: SETGE_UINT ; GCN: v_cmp_le_u32 -define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp ule i32 %a, %b %1 = sext i1 %0 to i32 @@ -298,7 +298,7 @@ entry: ; FUNC-LABEL: {{^}}i32_sgt: ; R600: SETGT_INT ; GCN: v_cmp_gt_i32 -define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 
{ +define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sgt i32 %a, %b %1 = sext i1 %0 to i32 @@ -309,7 +309,7 @@ entry: ; FUNC-LABEL: {{^}}i32_sge: ; R600: SETGE_INT ; GCN: v_cmp_ge_i32 -define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sge i32 %a, %b %1 = sext i1 %0 to i32 @@ -320,7 +320,7 @@ entry: ; FUNC-LABEL: {{^}}i32_slt: ; R600: SETGT_INT ; GCN: v_cmp_lt_i32 -define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp slt i32 %a, %b %1 = sext i1 %0 to i32 @@ -331,7 +331,7 @@ entry: ; FUNC-LABEL: {{^}}i32_sle: ; R600: SETGE_INT ; GCN: v_cmp_le_i32 -define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %0 = icmp sle i32 %a, %b %1 = sext i1 %0 to i32 @@ -348,7 +348,7 @@ entry: ; GCN-DAG: v_cmp_eq_u32 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm -define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 { +define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid @@ -369,7 +369,7 @@ define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptr ; GCN-DAG: v_cmp_eq_u32 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm -define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 { +define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid @@ -386,7 +386,7 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, ; FUNC-LABEL: setcc-i1 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1 ; GCN: s_cmp_eq_u32 [[AND]], 0 -define void @setcc-i1(i32 %in) #0 { +define amdgpu_kernel void @setcc-i1(i32 %in) #0 { %and = and i32 %in, 1 %cmp = icmp eq i32 %and, 0 br i1 %cmp, label %endif, label %if @@ -400,7 +400,7 @@ endif: ; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}} ; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0 ; GCN: s_and_b64 s[2:3], [[A]], [[B]] -define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { +define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 { bb0: %tmp5 = fcmp oge float %cond, 0.000000e+00 %tmp7 = fcmp ole float %cond, 1.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/setcc64.ll b/llvm/test/CodeGen/AMDGPU/setcc64.ll index 1f86277e0bc..1f1bdb05530 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc64.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc64.ll @@ -9,7 +9,7 @@ ; GCN-LABEL: {{^}}f64_oeq: ; GCN: v_cmp_eq_f64 -define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp oeq double %a, %b 
%tmp1 = sext i1 %tmp0 to i32 @@ -19,7 +19,7 @@ entry: ; GCN-LABEL: {{^}}f64_ogt: ; GCN: v_cmp_gt_f64 -define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ogt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -29,7 +29,7 @@ entry: ; GCN-LABEL: {{^}}f64_oge: ; GCN: v_cmp_ge_f64 -define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp oge double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -39,7 +39,7 @@ entry: ; GCN-LABEL: {{^}}f64_olt: ; GCN: v_cmp_lt_f64 -define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp olt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -49,7 +49,7 @@ entry: ; GCN-LABEL: {{^}}f64_ole: ; GCN: v_cmp_le_f64 -define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ole double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -60,7 +60,7 @@ entry: ; GCN-LABEL: {{^}}f64_one: ; GCN: v_cmp_lg_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp one double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -70,7 +70,7 @@ entry: ; GCN-LABEL: {{^}}f64_ord: ; GCN: v_cmp_o_f64 -define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ord double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -81,7 +81,7 @@ entry: ; GCN-LABEL: {{^}}f64_ueq: ; GCN: v_cmp_nlg_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ueq double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -93,7 +93,7 @@ entry: ; GCN: v_cmp_nle_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ugt double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -104,7 +104,7 @@ entry: ; GCN-LABEL: {{^}}f64_uge: ; GCN: v_cmp_nlt_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp uge double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -115,7 +115,7 @@ entry: ; GCN-LABEL: {{^}}f64_ult: ; GCN: v_cmp_nge_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp ult double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -126,7 +126,7 @@ entry: ; GCN-LABEL: {{^}}f64_ule: ; GCN: v_cmp_ngt_f64_e32 vcc ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc -define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, 
double %a, double %b) #0 { entry: %tmp0 = fcmp ule double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -136,7 +136,7 @@ entry: ; GCN-LABEL: {{^}}f64_une: ; GCN: v_cmp_neq_f64 -define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp une double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -146,7 +146,7 @@ entry: ; GCN-LABEL: {{^}}f64_uno: ; GCN: v_cmp_u_f64 -define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 { entry: %tmp0 = fcmp uno double %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -160,7 +160,7 @@ entry: ; GCN-LABEL: {{^}}i64_eq: ; GCN: v_cmp_eq_u64 -define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp eq i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -170,7 +170,7 @@ entry: ; GCN-LABEL: {{^}}i64_ne: ; GCN: v_cmp_ne_u64 -define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ne i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -180,7 +180,7 @@ entry: ; GCN-LABEL: {{^}}i64_ugt: ; GCN: v_cmp_gt_u64 -define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ugt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -190,7 +190,7 @@ entry: ; GCN-LABEL: {{^}}i64_uge: ; GCN: v_cmp_ge_u64 -define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp uge i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -200,7 +200,7 @@ entry: ; GCN-LABEL: {{^}}i64_ult: ; GCN: v_cmp_lt_u64 -define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ult i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -210,7 +210,7 @@ entry: ; GCN-LABEL: {{^}}i64_ule: ; GCN: v_cmp_le_u64 -define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp ule i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -220,7 +220,7 @@ entry: ; GCN-LABEL: {{^}}i64_sgt: ; GCN: v_cmp_gt_i64 -define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sgt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -230,7 +230,7 @@ entry: ; GCN-LABEL: {{^}}i64_sge: ; GCN: v_cmp_ge_i64 -define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sge i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -240,7 +240,7 @@ entry: ; GCN-LABEL: {{^}}i64_slt: ; GCN: v_cmp_lt_i64 -define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp slt i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 @@ -250,7 +250,7 @@ entry: ; GCN-LABEL: {{^}}i64_sle: ; GCN: v_cmp_le_i64 -define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 { entry: %tmp0 = icmp sle i64 %a, %b %tmp1 = sext i1 %tmp0 to i32 diff 
--git a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll index 7dc6eb87f6b..0b780af17bc 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-eliminate.ll @@ -6,7 +6,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: SUB_INT {{[* ]*}}[[RES]] ; EG-NOT: BFE -define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { +define amdgpu_kernel void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { %sext = sext i1 %a to i32 %res = add i32 %b, %sext store i32 %res, i32 addrspace(1)* %out @@ -18,7 +18,7 @@ define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) { ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: ADD_INT {{[* ]*}}[[RES]] ; EG-NOT: BFE -define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { +define amdgpu_kernel void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) { %sext = sext i1 %a to i32 %res = sub i32 %b, %sext store i32 %res, i32 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll index adba6bbb51d..7ac4e1d9fe4 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll @@ -11,7 +11,7 @@ ; EG: LSHR {{\*?}} [[ADDR]] ; Works with the align 2 removed -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { %c = add <2 x i32> %a, %b %x = shl <2 x i32> %c, <i32 6, i32 6> %y = ashr <2 x i32> %x, <i32 7, i32 7> diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index 015448cfdb3..82af20f08ae 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -15,7 +15,7 @@ ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]] ; EG: LSHR * [[ADDR]] ; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1 -define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { %shl = shl i32 %in, 31 %sext = ashr i32 %shl, 31 store i32 %sext, i32 addrspace(1)* %out @@ -32,7 +32,7 @@ define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 { ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 24 %ashr = ashr i32 %shl, 24 @@ -50,7 +50,7 @@ define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %shl = shl i32 %c, 16 %ashr = ashr i32 %shl, 16 @@ -68,7 +68,7 @@ define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { ; EG: ADD_INT ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal ; 
EG-NEXT: LSHR * [[ADDR]] -define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %c = add <1 x i32> %a, %b ; add to prevent folding into extload %shl = shl <1 x i32> %c, <i32 24> %ashr = ashr <1 x i32> %shl, <i32 24> @@ -82,7 +82,7 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 63 %ashr = ashr i64 %shl, 63 @@ -96,7 +96,7 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 56 %ashr = ashr i64 %shl, 56 @@ -111,7 +111,7 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 48 %ashr = ashr i64 %shl, 48 @@ -125,7 +125,7 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]] ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %c = shl i64 %a, %b %shl = shl i64 %c, 32 %ashr = ashr i64 %shl, 32 @@ -140,7 +140,7 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; XGCN: buffer_store_dword ; XEG: BFE_INT ; XEG: ASHR -; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 { +; define amdgpu_kernel void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 { ; %c = add <1 x i64> %a, %b ; %shl = shl <1 x i64> %c, <i64 56> ; %ashr = ashr <1 x i64> %shl, <i64 56> @@ -160,7 +160,7 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -187,7 +187,7 @@ define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 
addrspace(1)* % ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -214,7 +214,7 @@ define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -238,7 +238,7 @@ define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} -define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -264,7 +264,7 @@ define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; EG: LSHL ; EG: ASHR [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b %x = shl i32 %c, 6 %y = ashr i32 %x, 7 @@ -287,7 +287,7 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, ; EG: LSHL ; EG: ASHR [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b %x = shl <2 x i32> %c, <i32 6, i32 6> %y = ashr <2 x i32> %x, <i32 7, i32 7> @@ -305,7 +305,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, <i32 31, i32 31> %ashr = ashr <2 x i32> %shl, <i32 31, i32 31> @@ -326,7 +326,7 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i1_to_v4i32(<4 x 
i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %c = add <4 x i32> %a, %b ; add to prevent folding into extload %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31> %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31> @@ -343,7 +343,7 @@ define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, <i32 24, i32 24> %ashr = ashr <2 x i32> %shl, <i32 24, i32 24> @@ -364,7 +364,7 @@ define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %c = add <4 x i32> %a, %b ; add to prevent folding into extload %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24> %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24> @@ -381,7 +381,7 @@ define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> % ; EG: BFE_INT [[RES]] ; EG: BFE_INT [[RES]] ; EG: LSHR {{\*?}} [[ADDR]] -define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %c = add <2 x i32> %a, %b ; add to prevent folding into extload %shl = shl <2 x i32> %c, <i32 16, i32 16> %ashr = ashr <2 x i32> %shl, <i32 16, i32 16> @@ -390,7 +390,7 @@ define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> } ; FUNC-LABEL: {{^}}testcase: -define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { +define amdgpu_kernel void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { %and_a_1 = and i8 %a, 1 %cmp_eq = icmp eq i8 %and_a_1, 0 %cmp_slt = icmp slt i8 %a, 0 @@ -402,7 +402,7 @@ define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 { } ; FUNC-LABEL: {{^}}testcase_3: -define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { +define amdgpu_kernel void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { %and_a_1 = and i8 %a, 1 %cmp_eq = icmp eq i8 %and_a_1, 0 %cmp_slt = icmp slt i8 %a, 0 @@ -418,7 +418,7 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 { ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8 -define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { +define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload @@ -431,7 +431,7 @@ define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i ; FUNC-LABEL: 
{{^}}vgpr_sext_in_reg_v4i16_to_v4i32: ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16 -define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { +define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 { %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16 %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16 %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload @@ -446,7 +446,7 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x ; GCN: v_max_i32 ; GCN-NOT: bfe ; GCN: buffer_store_short -define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 { +define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 { %tmp5 = load i8, i8 addrspace(1)* %src, align 1 %tmp2 = sext i8 %tmp5 to i32 %tmp2.5 = icmp sgt i32 %tmp2, 0 @@ -462,7 +462,7 @@ declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone ; FUNC-LABEL: {{^}}bfe_0_width: ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone store i32 %bfe, i32 addrspace(1)* %out, align 4 @@ -473,7 +473,7 @@ define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GCN: v_bfe_i32 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone @@ -484,7 +484,7 @@ define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; FUNC-LABEL: {{^}}bfe_8_bfe_16: ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN: s_endpgm -define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone @@ -497,7 +497,7 @@ define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8 ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { %load = load i32, i32 addrspace(1)* %ptr, align 4 %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone @@ -510,7 +510,7 @@ define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void 
@sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone %shl = shl i32 %bfe, 24 @@ -520,7 +520,7 @@ define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) # } ; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong: -define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %c = add i32 %a, %b ; add to prevent folding into extload %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone %shl = shl i32 %bfe, 24 @@ -533,7 +533,7 @@ define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 ; GCN: buffer_load_sbyte ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { %load = load i8, i8 addrspace(1)* %ptr, align 1 %sext = sext i8 %load to i32 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone @@ -547,7 +547,7 @@ define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %pt ; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}} ; GCN-NOT: {{[^@]}}bfe ; GCN: s_endpgm -define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 { %load = load i8, i8 addrspace(1)* %ptr, align 1 %sext = sext i8 %load to i32 %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone @@ -562,7 +562,7 @@ define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* % ; GCN-NOT: shl ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1 ; GCN: s_endpgm -define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 31 %shr = ashr i32 %shl, 31 @@ -577,7 +577,7 @@ define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1 ; GCN-NOT: shr ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1 ; GCN: s_endpgm -define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 30 %shr = ashr i32 %shl, 30 @@ -593,7 +593,7 @@ define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2 ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2 ; GCN: s_endpgm -define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %x = load i32, i32 addrspace(1)* %in, align 4 %shl = shl i32 %x, 30 %shr = ashr i32 %shl, 30 @@ -617,7 +617,7 @@ define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1 ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] ; SI: buffer_store_dwordx2 
v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -647,7 +647,7 @@ define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrsp ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} ; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { +define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -673,7 +673,7 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 15 @@ -692,7 +692,7 @@ define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 -define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 %shl = shl i16 %in, 14 @@ -706,7 +706,7 @@ define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}} ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] -define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { +define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid @@ -727,7 +727,7 @@ define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}} ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]] -define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { +define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %b.gep = getelementptr i16, i16 addrspace(1)* 
%bptr, i32 %tid @@ -753,7 +753,7 @@ define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace( ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}} -define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 14 %sext = ashr i16 %shl, 14 store i16 %sext, i16 addrspace(1)* %out @@ -770,7 +770,7 @@ define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}} -define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 8 %sext = ashr i16 %shl, 8 store i16 %sext, i16 addrspace(1)* %out @@ -787,7 +787,7 @@ define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { ; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} ; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} ; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}} -define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { +define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { %shl = shl i16 %in, 1 %sext = ashr i16 %shl, 1 store i16 %sext, i16 addrspace(1)* %out @@ -798,7 +798,7 @@ define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 { ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]] ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]] -define void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %c = add <2 x i16> %a, %b ; add to prevent folding into extload %shl = shl <2 x i16> %c, <i16 15, i16 15> %ashr = ashr <2 x i16> %shl, <i16 15, i16 15> @@ -813,7 +813,7 @@ define void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}} -define void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload %shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15> %ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15> @@ -825,7 +825,7 @@ define void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> % ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]] ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]] -define void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %c = add <2 x i16> %a, %b ; add to prevent folding into extload %shl = shl <2 x i16> %c, <i16 14, i16 14> %ashr = ashr <2 x i16> %shl, <i16 14, i16 14> @@ -837,7 +837,7 @@ define void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; GFX9: v_pk_add_u16 
[[ADD:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]] ; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]] -define void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { %c = add <2 x i16> %a, %b ; add to prevent folding into extload %shl = shl <2 x i16> %c, <i16 8, i16 8> %ashr = ashr <2 x i16> %shl, <i16 8, i16 8> @@ -852,7 +852,7 @@ define void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> % ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} -define void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { +define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 { %c = add <3 x i16> %a, %b ; add to prevent folding into extload %shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8> %ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8> diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll index d5d2f6b717f..8e18ab5554e 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -13,7 +13,7 @@ ; SI: s_sub -define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -52,7 +52,7 @@ endif: ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]] ; SI: buffer_store_dword ; SI-NEXT: s_endpgm -define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %0 = icmp eq i32 %a, 0 br i1 %0, label %if, label %else @@ -79,7 +79,7 @@ endif: ; SI: s_add_i32 [[SGPR:s[0-9]+]] ; SI-NOT: s_add_i32 [[SGPR]] -define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { +define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %tid_f = uitofp i32 %tid to float @@ -116,7 +116,7 @@ endif: ; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]] ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]] ; SI: buffer_store_dword [[RESULT]] -define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { +define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp1 = icmp eq i32 %tid, 0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll index f44ae6e09e9..fb0bbaa9cbf 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll @@ -6,7 +6,7 @@ ; SI-LABEL: {{^}}test_dup_operands: ; SI: v_add_i32_e32 -define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) { +define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x 
i32> addrspace(1)* noalias %in) { %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %lo = extractelement <2 x i32> %a, i32 0 %hi = extractelement <2 x i32> %a, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll index 79bb5aff8f4..5c20e9a8d58 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-copy.ll @@ -265,7 +265,7 @@ endif: ; preds = %else, %if ; CHECK: buffer_load_dword ; CHECK: v_add ; CHECK: s_endpgm -define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { +define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) { entry: %tmp = load float, float addrspace(1)* %in0 %tmp1 = fcmp oeq float %tmp, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll index 48bbc32abcb..b3cb19ad05e 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll @@ -11,7 +11,7 @@ ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -33,7 +33,7 @@ define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -55,7 +55,7 @@ define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -77,7 +77,7 @@ define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x @@ -100,7 +100,7 @@ define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* ; GCN-DAG: buffer_store_dwordx4 
v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: s_endpgm -define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll index b85714ea54c..744c1c2b682 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -25,7 +25,7 @@ define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -41,7 +41,7 @@ define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -57,7 +57,7 @@ define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -73,7 +73,7 @@ define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define 
amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -89,7 +89,7 @@ define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -105,7 +105,7 @@ define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -121,7 +121,7 @@ define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -137,7 +137,7 @@ define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -155,7 +155,7 @@ define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -171,7 +171,7 @@ define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* 
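; ---------------------------------------------------------------------------
; Editor's note, not part of this commit: the shift-and-i64-ubfe.ll tests in
; this file all exercise the same fold, where a right-shift of an i64 followed
; by a mask of the low bits is selected as a single unsigned bitfield extract
; (v_bfe_u32) on the low dword, with the high dword set to zero, exactly as the
; v_bfe_u32/v_mov_b32/buffer_store_dwordx2 check lines above and below verify.
; A minimal sketch of that input pattern, using a hypothetical kernel name and
; the same shift/mask constants as the 20_21 test, would be:
;
define amdgpu_kernel void @uextract_bit_sketch(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
  %val = load i64, i64 addrspace(1)* %in
  %shr = lshr i64 %val, 20                 ; field starts at bit 20
  %and = and i64 %shr, 3                   ; keep 2 bits -> v_bfe_u32 dst, src, 20, 2
  store i64 %and, i64 addrspace(1)* %out   ; high half of the result is a zero vreg
  ret void
}
; ---------------------------------------------------------------------------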
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -188,7 +188,7 @@ define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}} ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -204,7 +204,7 @@ define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30 ; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -220,7 +220,7 @@ define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -236,7 +236,7 @@ define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]] ; GCN: buffer_store_dword v[[SHIFT]] -define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -252,7 +252,7 @@ define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}} ; GCN: buffer_store_dword [[BFE]] -define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { 
+define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -268,7 +268,7 @@ define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspac ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}} ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}} ; GCN: buffer_store_dword [[BFE]] -define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -286,7 +286,7 @@ define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]] ; GCN-NOT: v[[SHRLO]] ; GCN: buffer_store_dword v[[SHRLO]] -define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x @@ -306,7 +306,7 @@ define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addr ; GCN-NOT: v[[SHRLO]] ; GCN-NOT: v[[SHRHI]] ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} -define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -327,7 +327,7 @@ define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}} -define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -347,7 +347,7 @@ define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i6 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}} -define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x @@ -365,7 +365,7 @@ define void 
@v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i6 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} ; GCN: buffer_store_dword v[[ZERO]] -define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { +define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 { %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll index 28a7b924904..a803849be02 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll @@ -8,7 +8,7 @@ ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 35 store i64 %shl, i64 addrspace(1)* %out @@ -20,7 +20,7 @@ define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -32,7 +32,7 @@ define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 33 store i64 %shl, i64 addrspace(1)* %out @@ -43,7 +43,7 @@ define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = lshr i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -58,7 +58,7 @@ define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}} -define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff %shl = lshr i64 %and, 40 @@ -73,7 +73,7 @@ define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: 
v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 35 store i64 %shl, i64 addrspace(1)* %out @@ -84,7 +84,7 @@ define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]] ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -96,7 +96,7 @@ define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]] ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -106,7 +106,7 @@ define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x) ; GCN-LABEL: {{^}}ashr_i64_const_32: -define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = ashr i64 %val, 32 store i64 %shl, i64 addrspace(1)* %out @@ -114,7 +114,7 @@ define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { } ; GCN-LABEL: {{^}}ashr_i64_const_63: -define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = ashr i64 %val, 63 store i64 %shl, i64 addrspace(1)* %out @@ -125,7 +125,7 @@ define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 %trunc = trunc i64 %shl to i32 @@ -137,7 +137,7 @@ define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]] ; GCN: buffer_store_short [[SHL]] -define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 15 %trunc = trunc i64 %shl to i16 @@ -149,7 +149,7 @@ define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]] ; 
GCN: buffer_store_short [[SHL]] -define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) { %val = load i32, i32 addrspace(1)* %in %shl = shl i32 %val, 15 %trunc = trunc i32 %shl to i16 @@ -161,7 +161,7 @@ define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]] ; GCN: buffer_store_byte [[SHL]] -define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 7 %trunc = trunc i64 %shl to i8 @@ -174,7 +174,7 @@ define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]] ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]] ; GCN: buffer_store_byte [[AND]] -define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 1 %trunc = trunc i64 %shl to i2 @@ -186,7 +186,7 @@ define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 1 %trunc = trunc i64 %shl to i32 @@ -198,7 +198,7 @@ define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN: buffer_load_dword [[VAL:v[0-9]+]] ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]] ; GCN: buffer_store_dword [[SHL]] -define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 16 %trunc = trunc i64 %shl to i32 @@ -209,7 +209,7 @@ define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-LABEL: {{^}}trunc_shl_33_i32_i64: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN: buffer_store_dword [[ZERO]] -define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 33 %trunc = trunc i64 %shl to i32 @@ -222,7 +222,7 @@ define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) ; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}} ; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %val = load <2 x i64>, <2 x i64> addrspace(1)* %in %shl = shl <2 x i64> %val, <i64 16, i64 16> %trunc = trunc <2 x i64> %shl to <2 x i32> @@ -235,7 +235,7 @@ define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> ad ; 
GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31 ; GCN: buffer_store_dword v[[RESLO]] ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}} -define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %val = load i64, i64 addrspace(1)* %in %shl = shl i64 %val, 31 %trunc = trunc i64 %shl to i32 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll index 8844273ecc3..f6520eeb4fd 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -17,7 +17,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -44,7 +44,7 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -57,7 +57,7 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr @@ -70,7 +70,7 @@ define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { +define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { %a = load i16, i16 addrspace(1)* %in %result = shl i16 %a, %b store i16 %result, i16 addrspace(1)* %out @@ -81,7 +81,7 @@ define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { +define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) { %a = load i16, i16 addrspace(1)* %in %b.add = add i16 %b, 3 %result = shl i16 %a, %b.add @@ -92,7 +92,7 @@ define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, ; GCN-LABEL: {{^}}shl_i16_computed_amount: ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 3, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, [[ADD]], v{{[0-9]+}} -define void @shl_i16_computed_amount(i16 
addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid @@ -107,7 +107,7 @@ define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* % ; GCN-LABEL: {{^}}shl_i16_i_s: ; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 12 -define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { +define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { %result = shl i16 %a, 12 store i16 %result, i16 addrspace(1)* %out ret void @@ -116,7 +116,7 @@ define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) { ; GCN-LABEL: {{^}}shl_v2i16: ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -133,7 +133,7 @@ define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid @@ -160,7 +160,7 @@ define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in ; GCN-LABEL: {{^}}shl_i64: ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}} ; VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -199,7 +199,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -262,7 +262,7 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}} -define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void 
@shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -277,7 +277,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { %result = shl i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void @@ -287,7 +287,7 @@ define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]], ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}} -define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -299,7 +299,7 @@ define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}s_shl_constant_i64 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { %shl = shl i64 281474976710655, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -311,7 +311,7 @@ define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) { ; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}} ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] ; SI: buffer_store_dwordx2 -define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 1231231234567, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -323,7 +323,7 @@ define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) ; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}} ; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}} ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]] -define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 1234567, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -332,7 +332,7 @@ define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64: ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}} -define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { +define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) { %a = load i64, i64 addrspace(1)* %aptr, align 8 %shl = shl i64 64, %a store i64 %shl, i64 addrspace(1)* %out, align 8 @@ -341,7 +341,7 @@ define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64: ; SI: s_lshl_b64 
s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}} -define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 64, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -349,7 +349,7 @@ define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}} -define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 1, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -357,7 +357,7 @@ define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a ; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}} -define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4607182418800017408, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -365,7 +365,7 @@ define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13830554455654793216, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -373,7 +373,7 @@ define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}} -define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4602678819172646912, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -381,7 +381,7 @@ define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13826050856027422720, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -389,7 +389,7 @@ define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace( ; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}} -define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4611686018427387904, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -397,7 +397,7 @@ define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64: ; SI: s_lshl_b64 
s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13835058055282163712, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -405,7 +405,7 @@ define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}} -define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4616189618054758400, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -413,7 +413,7 @@ define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64: ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}} -define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13839561654909534208, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -427,7 +427,7 @@ define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 1082130432, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -439,7 +439,7 @@ define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace( ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}} ; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]] ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 -1065353216, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -450,7 +450,7 @@ define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { +define amdgpu_kernel void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 4647714815446351872, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -460,7 +460,7 @@ define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}} ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}} -define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { 
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) { %shl = shl i64 13871086852301127680, %a store i64 %shl, i64 addrspace(1)* %out, align 8 ret void @@ -468,7 +468,7 @@ define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 a ; FUNC-LABEL: {{^}}test_mul2: ; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1 -define void @test_mul2(i32 %p) { +define amdgpu_kernel void @test_mul2(i32 %p) { %i = mul i32 %p, 2 store volatile i32 %i, i32 addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll index bf4a6238eed..04145f777ba 100644 --- a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -13,7 +13,7 @@ ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CIVI: v_or_b32_e32 -define void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { +define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { %result = shl <2 x i16> %lhs, %rhs store <2 x i16> %result, <2 x i16> addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> ; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} ; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -55,7 +55,7 @@ define void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* % ; GFX9: s_load_dword [[RHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -70,7 +70,7 @@ define void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* ; GFX9: s_load_dword [[LHS:s[0-9]+]] ; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] -define void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { +define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -84,7 +84,7 @@ define void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* ; GCN-LABEL: {{^}}shl_imm_v_v2i16: ; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 -define void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> 
addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -98,7 +98,7 @@ define void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1 ; GCN-LABEL: {{^}}shl_v_imm_v2i16: ; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] ; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] -define void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext @@ -115,7 +115,7 @@ define void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1 ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext @@ -133,7 +133,7 @@ define void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* % ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} ; GCN: {{buffer|flat}}_store_dwordx2 -define void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll index 9b5f9fed4d7..9da4bc02801 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_constant.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]] ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -25,7 +25,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { ; SI-DAG: buffer_store_dword [[ADDREG]] ; SI-DAG: buffer_store_dword [[SHLREG]] ; SI: s_endpgm -define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -43,7 +43,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]] ; SI: 
buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x %val = load i32, i32 addrspace(1)* %ptr, align 4 @@ -61,7 +61,7 @@ define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 ; SI: s_addk_i32 [[RESULT]], 0x3d8 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %shl, %y @@ -78,7 +78,7 @@ define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) # ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] ; SI: buffer_store_dword [[VRESULT]] -define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 { %add.0 = add i32 %x, 123 %shl = shl i32 %add.0, 3 %add.1 = add i32 %y, %shl diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll index 6e45759fa05..9147eb58c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -19,7 +19,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8 ; SI: s_endpgm -define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -39,7 +39,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad ; SI-DAG: buffer_store_dword [[RESULT]] ; SI-DAG: buffer_store_dword [[ADDUSE]] ; SI: s_endpgm -define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -55,7 +55,7 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad ; SI-LABEL: {{^}}load_shl_base_lds_max_offset ; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535 ; SI: s_endpgm -define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 65535 %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0 @@ -73,7 +73,7 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3) ; SI: s_mov_b32 m0, -1 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, 
[[PTR]] offset0:1 offset1:9 ; SI: s_endpgm -define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 64 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -89,7 +89,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 { ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0 @@ -104,7 +104,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a @lds2 = addrspace(3) global [512 x i32] undef, align 4 -; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 ; %idx.0 = add nsw i32 %tid.x, 2 ; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -119,7 +119,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { +define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -134,7 +134,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -148,7 +148,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1) ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -162,7 +162,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* 
%out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -176,7 +176,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -190,7 +190,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -204,7 +204,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -214,7 +214,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ret void } -; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 ; %idx.0 = add nsw i32 %tid.x, 2 ; %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -228,7 +228,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] 
addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -242,7 +242,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -256,7 +256,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 @@ -270,7 +270,7 @@ define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1) ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}} ; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8 ; SI: s_endpgm -define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { +define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0 diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll index 08d44cc1e1d..14ca635c6da 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -7,7 +7,7 @@ ; GCN-LABEL: {{^}}v_test_i32_x_sub_64: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] -define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -23,7 +23,7 @@ define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]] ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]] -define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -40,7 +40,7 @@ define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace ; GCN-LABEL: {{^}}v_test_i32_64_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] -define void 
@v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -54,7 +54,7 @@ define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}v_test_i32_x_sub_65: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]] -define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -68,7 +68,7 @@ define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}v_test_i32_65_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]] -define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -82,7 +82,7 @@ define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]] -define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -96,7 +96,7 @@ define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %i ; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]] -define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -110,7 +110,7 @@ define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %i ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]] -define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -124,7 +124,7 @@ define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %i ; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x: ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]] ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]] -define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define 
amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -138,7 +138,7 @@ define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %i ; GCN-LABEL: {{^}}s_test_i32_x_sub_64: ; GCN: s_load_dword [[X:s[0-9]+]] ; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64 -define void @s_test_i32_x_sub_64(i32 %x) #0 { +define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 { %result = sub i32 %x, 64 call void asm sideeffect "; use $0", "s"(i32 %result) ret void @@ -147,7 +147,7 @@ define void @s_test_i32_x_sub_64(i32 %x) #0 { ; GCN-LABEL: {{^}}v_test_i16_x_sub_64: ; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]] ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]] -define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext @@ -166,7 +166,7 @@ define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]] ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]] -define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir index 1988a14b584..6248d8a46da 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -7,7 +7,7 @@ # resume crashes --- | - define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -20,7 +20,7 @@ ret void } - define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -33,7 +33,7 @@ ret void } - define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -46,7 +46,7 @@ ret void } - define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -59,7 +59,7 @@ ret void } - define void @shrink_addc_vop3(i32 
addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext @@ -72,7 +72,7 @@ ret void } - define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { + define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll index ef616eb6380..2bc734cc522 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll @@ -9,7 +9,7 @@ ; GCN: s_cbranch_vccnz ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 -define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 @@ -40,7 +40,7 @@ bb5: ; preds = %bb3, %bb1 ; GCN: s_cbranch_scc1 ; GCN: s_endpgm ; GCN: .Lfunc_end1 -define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll index 50c38878d69..e50c595bc6c 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll @@ -11,7 +11,7 @@ ; GCN: s_and_saveexec_b64 ; GCN-NOT: s_endpgm ; GCN: .Lfunc_end0 -define void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { +define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() br label %bb1 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll index d658b229fd3..a4b6d1fd069 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -10,7 +10,7 @@ ; SI: s_andn2_b64 ; s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) { main_body: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = and i32 %a, %tid @@ -40,7 +40,7 @@ ENDIF: ; SI: s_cbranch_execnz [[LOOP_LABEL]] ; SI: s_endpgm -define void @phi_cond_outside_loop(i32 %b) { +define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { entry: %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %0 = icmp eq i32 %tid , 0 @@ -68,7 +68,7 @@ exit: ; CHECK-LABEL: {{^}}switch_unreachable: ; CHECK-NOT: s_endpgm ; CHECK: .Lfunc_end2 -define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { +define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind { centry: switch i32 %x, label %sw.default [ i32 0, label 
%sw.bb @@ -100,7 +100,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; SI: [[ENDPGM]]: ; SI: s_endpgm -define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { +define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { entry: %cmp = icmp sgt i32 %c0, 0 br label %while.cond.outer diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index 025a3d8fca2..b0473f3b5bd 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -6,7 +6,7 @@ ; CHECK s_or_b64 exec, exec ; CHECK s_andn2_b64 exec, exec ; CHECK s_cbranch_execnz -define void @test(i32 %arg, i32 %arg1) { +define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { bb: %tmp = icmp ne i32 %arg, 0 %tmp7 = icmp ne i32 %arg1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir index 0c08deb13a8..20052e865a5 100644 --- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir +++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir @@ -1,7 +1,7 @@ # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN --- | - define void @phi_visit_order() { ret void } + define amdgpu_kernel void @phi_visit_order() { ret void } name: phi_visit_order tracksRegLiveness: true diff --git a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll index 0d1de6662f2..580268deb85 100644 --- a/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll @@ -5,7 +5,7 @@ ; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { entry: %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load volatile i32, i32 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll index 8d66df258e4..89c1eeb8381 100644 --- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll +++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll @@ -14,7 +14,7 @@ ; GCN: [[UNREACHABLE]]: ; GCN: ds_write_b32 ; GCN: s_waitcnt -define void @lower_control_flow_unreachable_terminator() #0 { +define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 @@ -41,7 +41,7 @@ ret: ; GCN-NEXT: s_or_b64 exec, exec ; GCN: ds_write_b32 ; GCN: s_waitcnt -define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { +define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 { bb: %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y() %tmp63 = icmp eq i32 %tmp15, 32 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index f5ae4f5fb73..114c97b61bd 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -24,7 +24,7 @@ ; SMEM: s_dcache_wb ; ALL: s_endpgm -define void @test(i32 addrspace(1)* %out, i32 %in) { 
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" () call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" () call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" () diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 313f5ee474e..8a4cee264fd 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2 ; FUNC-LABEL: @reorder_local_load_global_store_local_load ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3 ; CI: buffer_store_dword -define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -33,7 +33,7 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: buffer_store_dword ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 -define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -53,7 +53,7 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12 ; CI: buffer_store_dword -define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -77,7 +77,7 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: buffer_store_dword -define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { +define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 @@ -100,7 +100,7 @@ define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3 ; CI: ds_write_b32 ; CI: buffer_store_dword -define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { +define 
amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 { %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8 %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 @@ -122,7 +122,7 @@ define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* % ; CI: s_load_dword ; CI: ds_write_b32 ; CI: buffer_store_dword -define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { +define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2 @@ -141,7 +141,7 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 ; CI: buffer_load_dword ; CI: buffer_load_dword ; CI: buffer_store_dword -define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { +define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3 @@ -161,7 +161,7 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, ; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408 ; CI: buffer_store_dword ; CI: s_endpgm -define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { +define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102 @@ -187,7 +187,7 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa ; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408 ; CI: buffer_store_dword ; CI: s_endpgm -define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { +define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 { %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3 %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100 %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102 @@ -221,7 +221,7 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}} ; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}} -define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { +define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture 
%ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 diff --git a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll index dd8783df5c3..7990990478a 100644 --- a/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll +++ b/llvm/test/CodeGen/AMDGPU/si-vector-hang.ll @@ -12,7 +12,7 @@ ; CHECK: buffer_store_byte ; ModuleID = 'radeon' -define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { +define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 { entry: %0 = load i8, i8 addrspace(1)* %in0, align 1 %1 = insertelement <8 x i8> undef, i8 %0, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index 875351c5996..3e452c214e9 100644 --- a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}s_sext_i1_to_i32: ; GCN: v_cndmask_b32_e64 ; GCN: s_endpgm -define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i32 store i32 %sext, i32 addrspace(1)* %out, align 4 @@ -14,7 +14,7 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; GCN-LABEL: {{^}}test_s_sext_i32_to_i64: ; GCN: s_ashr_i32 ; GCN: s_endpg -define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { entry: %mul = mul i32 %a, %b %add = add i32 %mul, %c @@ -28,7 +28,7 @@ entry: ; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]] ; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}} ; GCN: s_endpgm -define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -38,7 +38,7 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; GCN-LABEL: {{^}}s_sext_i32_to_i64: ; GCN: s_ashr_i32 ; GCN: s_endpgm -define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { %sext = sext i32 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void @@ -47,7 +47,7 @@ define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind { ; GCN-LABEL: {{^}}v_sext_i32_to_i64: ; GCN: v_ashr ; GCN: s_endpgm -define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %val = load i32, i32 addrspace(1)* %in, align 4 %sext = sext i32 %val to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 @@ -56,7 +56,7 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no ; GCN-LABEL: {{^}}s_sext_i16_to_i64: ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000 -define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { +define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { 
%sext = sext i16 %a to i64 store i64 %sext, i64 addrspace(1)* %out, align 8 ret void @@ -65,7 +65,7 @@ define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind { ; GCN-LABEL: {{^}}s_sext_i1_to_i16: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { %cmp = icmp eq i32 %a, %b %sext = sext i1 %cmp to i16 store i16 %sext, i16 addrspace(1)* %out @@ -79,7 +79,7 @@ define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind { ; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { +define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind { %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %cmp = and i1 %cmp0, %cmp1 @@ -91,7 +91,7 @@ define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and: ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1 ; GCN-NEXT: buffer_store_short [[RESULT]] -define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { +define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %cmp0 = icmp eq i32 %a, %tid %cmp1 = icmp eq i32 %b, %c @@ -130,7 +130,7 @@ define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i ; GCN-DAG: buffer_store_dword [[VEXT3]] ; GCN: s_endpgm -define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> %elt0 = extractelement <4 x i32> %ext, i32 0 @@ -162,7 +162,7 @@ define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN: buffer_store_dword [[EXT1]] ; GCN: buffer_store_dword [[EXT2]] ; GCN: buffer_store_dword [[EXT3]] -define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind { %a = load i32, i32 addrspace(1)* %in %cast = bitcast i32 %a to <4 x i8> %ext = sext <4 x i8> %cast to <4 x i32> @@ -184,7 +184,7 @@ define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; GCN-DAG: s_sext_i32_i16 ; GCN-DAG: s_sext_i32_i16 ; GCN: s_endpgm -define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> %elt0 = extractelement <4 x i32> %ext, i32 0 @@ -206,7 +206,7 @@ define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind { ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: s_endpgm -define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind { %a = load i64, i64 
addrspace(1)* %in %cast = bitcast i64 %a to <4 x i16> %ext = sext <4 x i16> %cast to <4 x i32> diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll index 68dc3c6ccd2..f98a716b4fd 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll @@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-LABEL: {{^}}sint_to_fp_i32_to_f64 ; SI: v_cvt_f64_i32_e32 -define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { %result = sitofp i32 %in to double store double %result, double addrspace(1)* %out ret void @@ -19,7 +19,7 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}} ; SI: s_endpgm -define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = sitofp i1 %cmp to double store double %fp, double addrspace(1)* %out, align 4 @@ -31,14 +31,14 @@ define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) { ; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { +define amdgpu_kernel void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) { %fp = sitofp i1 %in to double store double %fp, double addrspace(1)* %out, align 8 ret void } ; SI-LABEL: @s_sint_to_fp_i64_to_f64 -define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { %result = sitofp i64 %in to double store double %result, double addrspace(1)* %out ret void @@ -51,7 +51,7 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %gep, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll index 5df8105116c..04cd199b81a 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -4,7 +4,7 @@ ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f16: -define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { %result = sitofp i64 %in to half store half %result, half addrspace(1)* %out ret void @@ -28,7 +28,7 @@ define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], ; GCN: v_cvt_f16_f32_e32 [[SIGN_SEL_F16:v[0-9]+]], [[SIGN_SEL]] ; GCN: {{buffer|flat}}_store_short {{.*}}[[SIGN_SEL_F16]] -define void @v_sint_to_fp_i64_to_f16(half 
addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -39,7 +39,7 @@ define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* } ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32: -define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { %result = sitofp i64 %in to float store float %result, float addrspace(1)* %out ret void @@ -62,7 +62,7 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}} ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]], ; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]] -define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -74,14 +74,14 @@ define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f32: ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, -define void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f32: -define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -93,14 +93,14 @@ define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6 ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f16: ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1, -define void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ %result = sitofp <2 x i64> %in to <2 x half> store <2 x half> %result, <2 x half> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f16: -define void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll index 4c8fea12bad..8e85d999859 100644 --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.ll @@ -6,7 +6,7 @@ ; SI: 
v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { %result = sitofp i32 %in to float store float %result, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}} ; R600: INT_TO_FLT -define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -32,7 +32,7 @@ define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X -define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{ +define amdgpu_kernel void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{ %result = sitofp <2 x i32> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) # ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %value = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = sitofp <4 x i32> %value to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -81,7 +81,7 @@ define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to float store float %fp, float addrspace(1)* %out @@ -92,7 +92,7 @@ define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { +define amdgpu_kernel void @s_sint_to_fp_i1_f32_load(float addrspace(1)* 
%out, i1 %in) #0 { %fp = sitofp i1 %in to float store float %fp, float addrspace(1)* %out ret void @@ -105,7 +105,7 @@ define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]] ; SI: s_endpgm -define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b216cbb2958..bbe8d7ce469 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -7,7 +7,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sitofp_i16_to_f16( +define amdgpu_kernel void @sitofp_i16_to_f16( half addrspace(1)* %r, i16 addrspace(1)* %a) { entry: @@ -23,7 +23,7 @@ entry: ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @sitofp_i32_to_f16( +define amdgpu_kernel void @sitofp_i32_to_f16( half addrspace(1)* %r, i32 addrspace(1)* %a) { entry: @@ -45,7 +45,7 @@ entry: ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sitofp_v2i16_to_v2f16( +define amdgpu_kernel void @sitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { entry: @@ -65,7 +65,7 @@ entry: ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @sitofp_v2i32_to_v2f16( +define amdgpu_kernel void @sitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/smed3.ll b/llvm/test/CodeGen/AMDGPU/smed3.ll index 16d743988d5..8665ab69726 100644 --- a/llvm/test/CodeGen/AMDGPU/smed3.ll +++ b/llvm/test/CodeGen/AMDGPU/smed3.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32: ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -25,7 +25,7 @@ define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a ; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32: ; GCN: v_max_i32 ; GCN: v_min_i32 -define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -45,7 +45,7 @@ define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* 
%aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -64,7 +64,7 @@ define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -83,7 +83,7 @@ define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64: ; GCN: v_cmp_lt_i64 ; GCN: v_cmp_gt_i64 -define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -102,7 +102,7 @@ define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16: ; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -174,7 +174,7 @@ define internal i8 @smax8(i8 %x, i8 %y) #2 { ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -186,7 +186,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -198,7 +198,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -210,7 +210,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_3(i32 
addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -222,7 +222,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -234,7 +234,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -246,7 +246,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -258,7 +258,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -270,7 +270,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -282,7 +282,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -294,7 +294,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -306,7 +306,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -318,7 +318,7 @@ bb: ; 
GCN-LABEL: {{^}}s_test_smed3_i32_pat_12: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -330,7 +330,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -342,7 +342,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -354,7 +354,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %y, i32 %x) %tmp1 = call i32 @smax(i32 %y, i32 %x) @@ -370,7 +370,7 @@ bb: ; GCN: s_sext_i32_i16 ; GCN: s_sext_i32_i16 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { bb: %tmp0 = call i16 @smin16(i16 %x, i16 %y) %tmp1 = call i16 @smax16(i16 %x, i16 %y) @@ -385,7 +385,7 @@ bb: ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { bb: %tmp0 = call i8 @smin8(i8 %x, i8 %y) %tmp1 = call i8 @smax8(i8 %x, i8 %y) @@ -397,7 +397,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -410,7 +410,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -423,7 +423,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2: ; GCN-NOT: v_med3_i32 -define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 
%z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -436,7 +436,7 @@ bb: ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result: ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @smin(i32 %x, i32 %y) %tmp1 = call i32 @smax(i32 %x, i32 %y) @@ -457,7 +457,7 @@ bb: ; VI: v_max_i16 ; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.ll b/llvm/test/CodeGen/AMDGPU/sminmax.ll index ce5d9245164..827d672022e 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.ll @@ -7,7 +7,7 @@ ; GCN: s_add_i32 ; EG: MAX_INT -define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { +define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg %res = select i1 %cond, i32 %val, i32 %neg @@ -22,7 +22,7 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind { ; GCN: v_add_i32 ; EG: MAX_INT -define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg @@ -36,7 +36,7 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]] ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]] ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]] -define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind { %val = load i32, i32 addrspace(1)* %src, align 4 %neg = sub i32 0, %val %cond = icmp sgt i32 %val, %neg @@ -54,7 +54,7 @@ define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %sr ; EG: MAX_INT ; EG: MAX_INT -define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { +define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind { %z0 = insertelement <2 x i32> undef, i32 0, i32 0 %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 %t0 = insertelement <2 x i32> undef, i32 2, i32 0 @@ -79,7 +79,7 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind ; EG: MAX_INT ; EG: MAX_INT -define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind { %z0 = insertelement <2 x i32> undef, i32 0, i32 0 %z1 = insertelement <2 x i32> %z0, i32 0, i32 1 %t0 = insertelement <2 x i32> undef, i32 2, i32 0 @@ -109,7 +109,7 @@ define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* % ; EG: 
MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { +define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind { %z0 = insertelement <4 x i32> undef, i32 0, i32 0 %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 @@ -146,7 +146,7 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind ; EG: MAX_INT ; EG: MAX_INT ; EG: MAX_INT -define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { +define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind { %z0 = insertelement <4 x i32> undef, i32 0, i32 0 %z1 = insertelement <4 x i32> %z0, i32 0, i32 1 %z2 = insertelement <4 x i32> %z1, i32 0, i32 2 @@ -170,7 +170,7 @@ define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* % ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]] -define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { +define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind { %cond0 = icmp sgt i32 %val0, %val1 %sel0 = select i1 %cond0, i32 %val0, i32 %val1 %sel1 = select i1 %cond0, i32 %val1, i32 %val0 @@ -186,7 +186,7 @@ define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]] -define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { +define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 @@ -208,7 +208,7 @@ define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 ; GCN-DAG: s_max_i32 ; GCN-DAG: s_max_i32 ; GCN-DAG: s_max_i32 -define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind { +define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind { %cond0 = icmp sgt <4 x i32> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1 %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0 @@ -223,7 +223,7 @@ define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace( ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e32 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc -define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { +define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind { %val0 = load volatile i32, i32 addrspace(1)* %ptr0 %val1 = load volatile i32, i32 addrspace(1)* %ptr1 diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll index 01d19a0c24b..8da5adad30d 100644 --- a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll @@ -17,7 +17,7 @@ ; CIVI: v_add_i32_e32 ; 
CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, ; CIVI: v_or_b32_e32 -define void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { +define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %neg = sub <2 x i16> zeroinitializer, %val %cond = icmp sgt <2 x i16> %val, %neg %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg @@ -41,7 +41,7 @@ define void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}} ; VI-NOT: v_and_b32 ; VI: v_or_b32_e32 -define void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { +define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid @@ -59,7 +59,7 @@ define void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* % ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -define void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { +define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 %t0 = insertelement <2 x i16> undef, i16 2, i16 0 @@ -77,7 +77,7 @@ define void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 { ; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]] ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]] ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2 -define void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { +define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 { %z0 = insertelement <2 x i16> undef, i16 0, i16 0 %z1 = insertelement <2 x i16> %z0, i16 0, i16 1 %t0 = insertelement <2 x i16> undef, i16 2, i16 0 @@ -101,7 +101,7 @@ define void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]] ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 -define void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { +define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 %z2 = insertelement <4 x i16> %z1, i16 0, i16 2 @@ -128,7 +128,7 @@ define void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 { ; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]] ; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]] ; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 -define void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 { +define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 { %z0 = insertelement <4 x i16> undef, i16 0, i16 0 %z1 = insertelement <4 x i16> %z0, i16 0, i16 1 %z2 = insertelement <4 x i16> %z1, i16 0, i16 2 @@ -147,7 +147,7 @@ define void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* % } ; GCN-LABEL: {{^}}s_min_max_v2i16: -define void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* 
%out1, <2 x i16> %val0, <2 x i16> %val1) #0 { +define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 { %cond0 = icmp sgt <2 x i16> %val0, %val1 %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 @@ -158,7 +158,7 @@ define void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace( } ; GCN-LABEL: {{^}}v_min_max_v2i16: -define void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 @@ -172,7 +172,7 @@ define void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace( } ; GCN-LABEL: {{^}}s_min_max_v4i32: -define void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { +define amdgpu_kernel void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 { %cond0 = icmp sgt <4 x i16> %val0, %val1 %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1 %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0 @@ -183,7 +183,7 @@ define void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace( } ; GCN-LABEL: {{^}}v_min_max_v2i16_user: -define void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 { %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0 %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1 @@ -200,7 +200,7 @@ define void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrs ; GCN-LABEL: {{^}}u_min_max_v2i16: ; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} -define void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { +define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind { %cond0 = icmp ugt <2 x i16> %val0, %val1 %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1 %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0 diff --git a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll index daac5b92b1e..343211b0219 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -12,7 +12,7 @@ ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm -define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { +define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { entry: %cnd = fcmp oeq float 0.0, %cond %sgpr = load volatile i32, i32 addrspace(2)* %in @@ -32,7 +32,7 @@ endif: ; GCN: buffer_store_dword ; GCN: [[EXIT]]: ; GCN: s_endpgm 
-define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { +define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) { entry: %vgpr = load volatile float, float addrspace(1)* %in %cnd = fcmp oeq float 0.0, %vgpr diff --git a/llvm/test/CodeGen/AMDGPU/smrd.ll b/llvm/test/CodeGen/AMDGPU/smrd.ll index 2384b741973..50f72c67059 100644 --- a/llvm/test/CodeGen/AMDGPU/smrd.ll +++ b/llvm/test/CodeGen/AMDGPU/smrd.ll @@ -6,7 +6,7 @@ ; GCN-LABEL: {{^}}smrd0: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 -define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -18,7 +18,7 @@ entry: ; GCN-LABEL: {{^}}smrd1: ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}} ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -33,7 +33,7 @@ entry: ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 ; GCN: s_endpgm -define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -48,7 +48,7 @@ entry: ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b ; TODO: Add VI checks ; GCN: s_endpgm -define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -62,7 +62,7 @@ entry: ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 %tmp1 = load i32, i32 addrspace(2)* %tmp @@ -76,7 +76,7 @@ entry: ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { +define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { entry: %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 %tmp1 = load i32, i32 addrspace(2)* %tmp diff --git a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll index 74acc5bc961..c0f773ca70c 100644 --- a/llvm/test/CodeGen/AMDGPU/sopk-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/sopk-compares.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.groupstaticsize() #1 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}} -define void @br_scc_eq_i32_inline_imm(i32 %cond, 
i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -25,7 +25,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}} -define void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 32767 br i1 %cmp0, label %endif, label %if @@ -41,7 +41,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -57,7 +57,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1: ; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -73,7 +73,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, -32768 br i1 %cmp0, label %endif, label %if @@ -89,7 +89,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}} -define void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, -32769 br i1 %cmp0, label %endif, label %if @@ -105,7 +105,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}} -define void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65535 br i1 %cmp0, label %endif, label %if @@ -121,7 +121,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max: ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}} -define void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65535 br i1 %cmp0, label %endif, label %if @@ -137,7 +137,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1: ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}} -define void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65536 br i1 %cmp0, label %endif, label %if @@ -154,7 +154,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_eq_i32: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -170,7 +170,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ne_i32: ; GCN: 
s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -186,7 +186,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sgt_i32: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -202,7 +202,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}} -define void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 32767 br i1 %cmp0, label %endif, label %if @@ -218,7 +218,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1: ; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}} -define void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp sgt i32 %cond, 32768 br i1 %cmp0, label %endif, label %if @@ -234,7 +234,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sge_i32: ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sge i32 %cond, %size @@ -251,7 +251,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_slt_i32: ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp slt i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -267,7 +267,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_sle_i32: ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sle i32 %cond, %size @@ -284,7 +284,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ugt_i32: ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ugt i32 %cond, %size @@ -301,7 +301,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_uge_i32: ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp uge i32 %cond, %size @@ -318,7 +318,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32: ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}} -define void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, 65 br i1 %cmp0, label %endif, label %if @@ -334,7 +334,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16: ; GCN: s_cmp_lt_u32 s2, 0xffff8000 -define void @br_scc_ult_i32_min_simm16(i32 %cond, i32 
addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32768 br i1 %cmp0, label %endif, label %if @@ -350,7 +350,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1: ; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}} -define void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ult i32 %cond, -32769 br i1 %cmp0, label %endif, label %if @@ -366,7 +366,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ule_i32: ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}} -define void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ule i32 %cond, %size @@ -383,7 +383,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_eq_i32: ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp eq i32 %size, %cond @@ -400,7 +400,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ne_i32: ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ne i32 %size, %cond @@ -417,7 +417,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_sgt_i32: ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sgt i32 %size, %cond @@ -434,7 +434,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_sge_i32: ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sge i32 %size, %cond @@ -451,7 +451,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_slt_i32: ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp slt i32 %size, %cond @@ -468,7 +468,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_sle_i32: ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp sle i32 %size, %cond @@ -485,7 +485,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ugt_i32: ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ugt i32 %size, %cond @@ -502,7 +502,7 @@ endif: ; 
GCN-LABEL: {{^}}commute_br_scc_uge_i32: ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp uge i32 %size, %cond @@ -519,7 +519,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ult_i32: ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ult i32 %size, %cond @@ -536,7 +536,7 @@ endif: ; GCN-LABEL: {{^}}commute_br_scc_ule_i32: ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}} -define void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %cmp0 = icmp ule i32 %size, %cond @@ -553,7 +553,7 @@ endif: ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16: ; GCN: s_cmp_lt_u32 s2, 0xfffff7ff -define void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %size = call i32 @llvm.amdgcn.groupstaticsize() %not.size = xor i32 %size, -1 @@ -573,7 +573,7 @@ endif: ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4 ; SI: v_cmp_eq_u64_e64 -define void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i64 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -593,7 +593,7 @@ endif: ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_eq_u64_e32 -define void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp eq i64 %cond, 1234 br i1 %cmp0, label %endif, label %if @@ -611,7 +611,7 @@ endif: ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4 ; SI: v_cmp_ne_u64_e64 -define void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i64 %cond, 4 br i1 %cmp0, label %endif, label %if @@ -631,7 +631,7 @@ endif: ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; SI: v_cmp_ne_u64_e32 -define void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = icmp ne i64 %cond, 1234 br i1 %cmp0, label %endif, label %if diff --git a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll index ff9429843b2..63ea21b0533 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll @@ -4,7 +4,7 @@ ; allocate scratch registers correctly. Check that this test compiles without ; error. 
; TONGA-LABEL: test -define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) { entry: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo) diff --git a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll index 686c83116fd..1ca0919258a 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-cfg-position.ll @@ -13,7 +13,7 @@ ; CHECK-NEXT: s_or_b64 exec ; CHECK: buffer_ -define void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) { +define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) { bb: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp14 = load i32, i32 addrspace(1)* %arg, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 3d88b9aa3da..0e715c45320 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -43,7 +43,7 @@ ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]] ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1 -define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 { entry: %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 %cmp0 = icmp eq i32 %cond, 0 @@ -136,7 +136,7 @@ endif: ; preds = %else, %if ; GCN-NOT: v_readlane_b32 m0 ; GCN-NOT: s_buffer_store_dword m0 ; GCN-NOT: s_buffer_load_dword m0 -define void @m0_unavailable_spill(i32 %m0.arg) #0 { +define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 { main_body: %m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0 %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg) @@ -189,7 +189,7 @@ endif: ; TOSMEM: s_dcache_wb ; TOSMEM: s_endpgm -define void @restore_m0_lds(i32 %arg) { +define amdgpu_kernel void @restore_m0_lds(i32 %arg) { %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0 %sval = load volatile i64, i64 addrspace(2)* undef %cmp = icmp eq i32 %arg, 0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll index 9b3dfab2be6..c05021a91ff 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -11,7 +11,7 @@ ; Just test that it compiles successfully. 
; CHECK-LABEL: test -define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) diff --git a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll index d4e2dc81405..5d7d29db3a2 100644 --- a/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll +++ b/llvm/test/CodeGen/AMDGPU/split-scalar-i64-add.ll @@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0: ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}} ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc -define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) { %v.val = load volatile i32, i32 addrspace(1)* %in %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 @@ -23,7 +23,7 @@ define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0: ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f ; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0 -define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { +define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -35,7 +35,7 @@ define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) { ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1: ; SI: v_add_i32 ; SI: v_addc_u32 -define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %v.val = load volatile i32, i32 addrspace(1)* %in %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1 @@ -48,7 +48,7 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1: ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}} -define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { +define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) { %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0 %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1 %bc = bitcast <2 x i32> %vec.1 to i64 @@ -61,7 +61,7 @@ define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i6 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2: ; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}} ; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc -define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { +define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) { %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone %gep = getelementptr i32, 
i32 addrspace(1)* %in, i32 %tid %load = load i32, i32 addrspace(1)* %gep diff --git a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll index da6c7204cd1..c2426993bb3 100644 --- a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -29,7 +29,7 @@ ; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24 ; GCN: s_endpgm -define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { +define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 { entry: %tmp = tail call i32 @llvm.r600.read.local.size.y() %tmp1 = tail call i32 @llvm.r600.read.local.size.z() diff --git a/llvm/test/CodeGen/AMDGPU/splitkit.mir b/llvm/test/CodeGen/AMDGPU/splitkit.mir index 1120ad77b53..ee58138b1af 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit.mir +++ b/llvm/test/CodeGen/AMDGPU/splitkit.mir @@ -1,7 +1,7 @@ # RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s --- | - define void @func0() #0 { ret void } - define void @func1() #0 { ret void } + define amdgpu_kernel void @func0() #0 { ret void } + define amdgpu_kernel void @func1() #0 { ret void } attributes #0 = { "amdgpu-num-sgpr"="12" } ... diff --git a/llvm/test/CodeGen/AMDGPU/sra.ll b/llvm/test/CodeGen/AMDGPU/sra.ll index ad7c86fe791..25590c93d1e 100644 --- a/llvm/test/CodeGen/AMDGPU/sra.ll +++ b/llvm/test/CodeGen/AMDGPU/sra.ll @@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -37,7 +37,7 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -51,7 +51,7 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; global load we end up with the vector instructions rather than scalar. 
; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1)* %in %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr @@ -67,7 +67,7 @@ define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %i ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1)* %in %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr @@ -80,7 +80,7 @@ define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %i ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8 ; EG: ASHR -define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) { entry: %in.ext = sext i32 %in to i64 %ashr = ashr i64 %in.ext, 8 @@ -105,7 +105,7 @@ entry: ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}} -define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { entry: %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in @@ -143,7 +143,7 @@ entry: ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -156,7 +156,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ; XFUNC-LABEL: {{^}}s_ashr_v2i64: ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}} ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}} -; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) { +; define amdgpu_kernel void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) { ; %result = ashr <2 x i64> %a, %b ; store <2 x i64> %result, <2 x i64> addrspace(1)* %out ; ret void @@ -221,7 +221,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr 
@@ -235,7 +235,7 @@ define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}} ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}} -define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = ashr i64 %a, 32 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -247,7 +247,7 @@ define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; VI: flat_load_dword v[[HI:[0-9]+]] ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}} -define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -262,7 +262,7 @@ define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}} -define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = ashr i64 %a, 63 %add = add i64 %result, %b store i64 %add, i64 addrspace(1)* %out @@ -275,7 +275,7 @@ define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]] ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]] ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}} -define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index c78fd549b31..c89f798397a 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=redwood < %s -define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in %den = load i32, i32 addrspace(1) * %den_ptr @@ -11,7 +11,7 @@ define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ret void } -define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = srem i32 %num, 4 store i32 %result, i32 addrspace(1)* %out @@ -24,14 +24,14 @@ define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_mul_lo_i32 ; SI: v_sub_i32 ; SI: s_endpgm -define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 
addrspace(1) * %in %result = srem i32 %num, 7 store i32 %result, i32 addrspace(1)* %out ret void } -define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr @@ -40,14 +40,14 @@ define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ret void } -define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %num = load <2 x i32>, <2 x i32> addrspace(1) * %in %result = srem <2 x i32> %num, <i32 4, i32 4> store <2 x i32> %result, <2 x i32> addrspace(1)* %out ret void } -define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr @@ -56,14 +56,14 @@ define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ret void } -define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %num = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4> store <4 x i32> %result, <4 x i32> addrspace(1)* %out ret void } -define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %num = load i64, i64 addrspace(1) * %in %den = load i64, i64 addrspace(1) * %den_ptr @@ -72,14 +72,14 @@ define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ret void } -define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %num = load i64, i64 addrspace(1) * %in %result = srem i64 %num, 4 store i64 %result, i64 addrspace(1)* %out ret void } -define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %num = load <2 x i64>, <2 x i64> addrspace(1) * %in %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr @@ -88,14 +88,14 @@ define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ret void } -define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %num = load <2 x i64>, <2 x i64> addrspace(1) * %in %result = srem <2 x i64> %num, <i64 4, i64 4> store <2 x i64> %result, <2 x i64> addrspace(1)* %out ret void } -define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %num 
= load <4 x i64>, <4 x i64> addrspace(1) * %in %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr @@ -104,7 +104,7 @@ define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ret void } -define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %num = load <4 x i64>, <4 x i64> addrspace(1) * %in %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4> store <4 x i64> %result, <4 x i64> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/srl.ll b/llvm/test/CodeGen/AMDGPU/srl.ll index 6b006fd936d..1daf4bb33e8 100644 --- a/llvm/test/CodeGen/AMDGPU/srl.ll +++ b/llvm/test/CodeGen/AMDGPU/srl.ll @@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() #0 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -26,7 +26,7 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -50,7 +50,7 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -74,7 +74,7 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}} ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]] ; EG-DAG: CNDE_INT {{\*? 
*}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0 -define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -112,7 +112,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; EG-DAG: CNDE_INT {{.*}}, 0.0 ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -178,7 +178,7 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT ; EG-DAG: CNDE_INT -define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr @@ -193,7 +193,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]] ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { %result = lshr i64 %a, 32 store i64 %result, i64 addrspace(1)* %out ret void @@ -203,7 +203,7 @@ define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) { ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}} ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}} -define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() #0 %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/ssubo.ll b/llvm/test/CodeGen/AMDGPU/ssubo.ll index 26884a1b776..135632343f9 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubo.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubo.ll @@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone ; FUNC-LABEL: {{^}}ssubo_i64_zext: -define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -17,7 +17,7 @@ define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { } ; FUNC-LABEL: {{^}}s_ssubo_i32: -define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind { +define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* 
%carryout, i32 %a, i32 %b) nounwind { %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind %val = extractvalue { i32, i1 } %ssub, 0 %carry = extractvalue { i32, i1 } %ssub, 1 @@ -27,7 +27,7 @@ define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 } ; FUNC-LABEL: {{^}}v_ssubo_i32: -define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind @@ -41,7 +41,7 @@ define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ; FUNC-LABEL: {{^}}s_ssubo_i64: ; SI: s_sub_u32 ; SI: s_subb_u32 -define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind { %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %ssub, 0 %carry = extractvalue { i64, i1 } %ssub, 1 @@ -53,7 +53,7 @@ define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; FUNC-LABEL: {{^}}v_ssubo_i64: ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { +define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind { %a = load i64, i64 addrspace(1)* %aptr, align 4 %b = load i64, i64 addrspace(1)* %bptr, align 4 %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind diff --git a/llvm/test/CodeGen/AMDGPU/store-barrier.ll b/llvm/test/CodeGen/AMDGPU/store-barrier.ll index 57a93ccd250..afa4e94222c 100644 --- a/llvm/test/CodeGen/AMDGPU/store-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/store-barrier.ll @@ -12,7 +12,7 @@ ; CHECK: s_barrier ; CHECK: s_endpgm ; Function Attrs: nounwind -define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 { +define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 { bb: %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9 %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/store-global.ll b/llvm/test/CodeGen/AMDGPU/store-global.ll index 5d49795a68e..160e921fc07 100644 --- a/llvm/test/CodeGen/AMDGPU/store-global.ll +++ b/llvm/test/CodeGen/AMDGPU/store-global.ll @@ -11,7 +11,7 @@ ; CM-NOT: MEM_RAT MSKOR ; GCN: buffer_store_byte -define void @store_i1(i1 addrspace(1)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) { entry: store i1 true, i1 addrspace(1)* %out ret void @@ -42,7 +42,7 @@ entry: ; GCN: buffer_store_byte -define void @store_i8(i8 addrspace(1)* %out, i8 %in) { +define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) { entry: store i8 %in, i8 
addrspace(1)* %out ret void @@ -75,7 +75,7 @@ entry: ; EG: MOV * T[[RW_GPR]].Z, 0.0 ; GCN: buffer_store_short -define void @store_i16(i16 addrspace(1)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) { entry: store i16 %in, i16 addrspace(1)* %out ret void @@ -88,7 +88,7 @@ entry: ; EG: MEM_RAT MSKOR ; EG: MEM_RAT MSKOR -define void @store_i24(i24 addrspace(1)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) { entry: store i24 %in, i24 addrspace(1)* %out ret void @@ -104,7 +104,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD ; CM-NOT: MEM_RAT -define void @store_i25(i25 addrspace(1)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) { entry: store i25 %in, i25 addrspace(1)* %out ret void @@ -119,7 +119,7 @@ entry: ; CM-NOT: MEM_RAT MSKOR ; GCN: buffer_store_short -define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(1)* %out @@ -136,7 +136,7 @@ entry: ; CM-NOT: MEM_RAT MSKOR ; SI: buffer_store_byte -define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1 @@ -150,7 +150,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dword -define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(1)* %out @@ -170,7 +170,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short -define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2 @@ -183,7 +183,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dword -define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out @@ -210,7 +210,7 @@ entry: ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1 @@ -231,7 +231,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2 @@ -246,7 +246,7 @@ entry: ; GCN: buffer_store_dword -define void @store_f32(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) { store float %in, float addrspace(1)* %out ret void } @@ -257,7 
+257,7 @@ define void @store_f32(float addrspace(1)* %out, float %in) { ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}} ; GCN: buffer_store_dwordx2 -define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> store <4 x i16> %0, <4 x i16> addrspace(1)* %out @@ -272,7 +272,7 @@ entry: ; GCN: buffer_store_dwordx2 -define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 @@ -286,7 +286,7 @@ entry: ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}}, -define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16 ret void } @@ -299,7 +299,7 @@ define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind { ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx4 -define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out ret void @@ -313,7 +313,7 @@ entry: ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; SI: buffer_store_dwordx4 -define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4 ret void @@ -328,7 +328,7 @@ entry: ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx4 -define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { %1 = load <4 x float>, <4 x float> addrspace(1) * %in store <4 x float> %1, <4 x float> addrspace(1)* %out ret void @@ -340,7 +340,7 @@ define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1 ; CM: MEM_RAT MSKOR ; GCN: buffer_store_byte -define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(1)* %out @@ -350,7 +350,7 @@ entry: ; FUNC-LABEL: {{^}}store_i64_i16: ; EG: MEM_RAT MSKOR ; GCN: buffer_store_short -define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(1)* %out @@ -369,7 +369,7 @@ entry: ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD ; GCN: buffer_store_dwordx2 -define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 @@ -388,7 +388,7 @@ entry: ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X ; GCN: buffer_store_dwordx4 
-define void @i128-const-store(i32 addrspace(1)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) { entry: store i32 1, i32 addrspace(1)* %out, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/store-local.ll b/llvm/test/CodeGen/AMDGPU/store-local.ll index 03fd30ca9a2..c144bf2aa87 100644 --- a/llvm/test/CodeGen/AMDGPU/store-local.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.ll @@ -9,7 +9,7 @@ ; CM: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i1(i1 addrspace(3)* %out) { +define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) { entry: store i1 true, i1 addrspace(3)* %out ret void @@ -21,7 +21,7 @@ entry: ; CM: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { +define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { store i8 %in, i8 addrspace(3)* %out ret void } @@ -32,7 +32,7 @@ define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) { ; CM: LDS_SHORT_WRITE ; GCN: ds_write_b16 -define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { +define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { store i16 %in, i16 addrspace(3)* %out ret void } @@ -43,7 +43,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) { ; CM: LDS_WRITE ; GCN: ds_write_b32 -define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { +define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) { entry: store <2 x i16> %in, <2 x i16> addrspace(3)* %out ret void @@ -55,7 +55,7 @@ entry: ; CM: LDS_WRITE ; GCN: ds_write_b32 -define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out ret void @@ -78,7 +78,7 @@ entry: ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 -define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1 ret void @@ -95,7 +95,7 @@ entry: ; GCN: ds_write_b16 ; GCN: ds_write_b16 -define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { +define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) { entry: store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2 ret void @@ -111,7 +111,7 @@ entry: ; CM-NOT: LDS_WRITE ; GCN: ds_write_b64 -define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) { entry: store <2 x i32> %in, <2 x i32> addrspace(3)* %out ret void @@ -129,7 +129,7 @@ entry: ; CM: LDS_WRITE ; GCN: ds_write2_b64 -define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out ret void @@ -148,7 +148,7 @@ entry: ; GCN: ds_write2_b32 ; GCN: ds_write2_b32 -define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4 ret void @@ -157,7 +157,7 @@ entry: ; 
FUNC-LABEL: {{^}}store_local_i64_i8: ; EG: LDS_BYTE_WRITE ; GCN: ds_write_b8 -define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { +define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(3)* %out @@ -167,7 +167,7 @@ entry: ; FUNC-LABEL: {{^}}store_local_i64_i16: ; EG: LDS_SHORT_WRITE ; GCN: ds_write_b16 -define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { +define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(3)* %out diff --git a/llvm/test/CodeGen/AMDGPU/store-private.ll b/llvm/test/CodeGen/AMDGPU/store-private.ll index 33d27f24e9c..ab73ada370e 100644 --- a/llvm/test/CodeGen/AMDGPU/store-private.ll +++ b/llvm/test/CodeGen/AMDGPU/store-private.ll @@ -15,7 +15,7 @@ ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_i1(i1 addrspace(0)* %out) { +define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) { entry: store i1 true, i1 addrspace(0)* %out ret void @@ -44,7 +44,7 @@ entry: ; SI: buffer_store_byte -define void @store_i8(i8 addrspace(0)* %out, i8 %in) { +define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) { entry: store i8 %in, i8 addrspace(0)* %out ret void @@ -72,7 +72,7 @@ entry: ; EG: MOV * T(0 + AR.x).X+, [[RES]] ; SI: buffer_store_short -define void @store_i16(i16 addrspace(0)* %out, i16 %in) { +define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) { entry: store i16 %in, i16 addrspace(0)* %out ret void @@ -102,7 +102,7 @@ entry: ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+, ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, -define void @store_i24(i24 addrspace(0)* %out, i24 %in) { +define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) { entry: store i24 %in, i24 addrspace(0)* %out ret void @@ -120,7 +120,7 @@ entry: ; CM: MOVA_INT ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; CM-NOT: MOVA_INT -define void @store_i25(i25 addrspace(0)* %out, i25 %in) { +define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) { entry: store i25 %in, i25 addrspace(0)* %out ret void @@ -141,7 +141,7 @@ entry: ; CM-NOT: MOVA_INT ; SI: buffer_store_short -define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(0)* %out @@ -172,7 +172,7 @@ entry: ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i8> store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1 @@ -191,7 +191,7 @@ entry: ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(0)* %out @@ -223,7 +223,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short -define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { +define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) { entry: %0 = trunc <2 x i32> %in to <2 x i16> store <2 x i16> %0, <2 x i16> addrspace(0)* 
%out, align 2 @@ -240,7 +240,7 @@ entry: ; CM-NOT: MOVA_INT ; SI: buffer_store_dword -define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out @@ -299,7 +299,7 @@ entry: ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1 @@ -410,7 +410,7 @@ entry: ; SI: buffer_store_byte ; SI: buffer_store_byte ; SI-NOT: buffer_store_dword -define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { +define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) { entry: %0 = trunc <8 x i32> %in to <8 x i8> store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1 @@ -443,7 +443,7 @@ entry: ; SI: buffer_store_short ; SI: buffer_store_short ; SI-NOT: buffer_store_dword -define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i8> store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2 @@ -460,7 +460,7 @@ entry: ; SI: buffer_store_dword -define void @store_f32(float addrspace(0)* %out, float %in) { +define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) { store float %in, float addrspace(0)* %out ret void } @@ -480,7 +480,7 @@ define void @store_f32(float addrspace(0)* %out, float %in) { ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) { entry: %0 = trunc <4 x i32> %in to <4 x i16> store <4 x i16> %0, <4 x i16> addrspace(0)* %out @@ -504,7 +504,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { +define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) { entry: %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0 %1 = insertelement <2 x float> %0, float %b, i32 1 @@ -533,7 +533,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { +define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16 ret void } @@ -563,7 +563,7 @@ define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind { ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 x i32> addrspace(0)* %out ret void @@ -594,7 +594,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { +define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) { entry: store <4 x i32> %in, <4 
x i32> addrspace(0)* %out, align 4 ret void @@ -626,7 +626,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { +define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) { %1 = load <4 x float>, <4 x float> addrspace(0) * %in store <4 x float> %1, <4 x float> addrspace(0)* %out ret void @@ -644,7 +644,7 @@ define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_byte -define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) { entry: %0 = trunc i64 %in to i8 store i8 %0, i8 addrspace(0)* %out @@ -663,7 +663,7 @@ entry: ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+, ; SI: buffer_store_short -define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { +define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) { entry: %0 = trunc i64 %in to i16 store i16 %0, i16 addrspace(0)* %out @@ -689,7 +689,7 @@ entry: ; XSI: buffer_store_dwordx2 ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { +define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 { entry: %0 = load i32, i32 addrspace(2)* %mem, align 4 %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1 @@ -727,7 +727,7 @@ entry: ; SI: buffer_store_dword ; SI: buffer_store_dword ; SI: buffer_store_dword -define void @i128-const-store(i32 addrspace(0)* %out) { +define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) { entry: store i32 1, i32 addrspace(0)* %out, align 4 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll index 78db2d37724..7518e887135 100644 --- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll +++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll @@ -5,7 +5,7 @@ ; GCN-LABEL: {{^}}global_store_v3i64: ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32 ret void } @@ -40,7 +40,7 @@ define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { ; GCN: buffer_store_byte ; GCN: buffer_store_byte ; GCN: buffer_store_byte -define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1 ret void } @@ -48,7 +48,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64 ; GCN-LABEL: {{^}}local_store_v3i64: ; GCN: ds_write2_b64 ; GCN: ds_write_b64 -define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { +define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32 ret void } @@ -83,7 +83,7 @@ define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x 
i64> %x) { ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 -define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { +define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) { store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1 ret void } @@ -91,7 +91,7 @@ define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32: ; GCN-DAG: buffer_store_dwordx2 ; GCN-DAG: buffer_store_dword v -define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i32> store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out ret void @@ -100,7 +100,7 @@ define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16: ; GCN-DAG: buffer_store_short ; GCN-DAG: buffer_store_dword v -define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i16> store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out ret void @@ -110,7 +110,7 @@ define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8: ; GCN-DAG: buffer_store_short ; GCN-DAG: buffer_store_byte v -define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i8> store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out ret void @@ -120,7 +120,7 @@ define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i ; GCN-DAG: buffer_store_byte v ; GCN-DAG: buffer_store_byte v ; GCN-DAG: buffer_store_byte v -define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) { +define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) { %trunc = trunc <3 x i64> %x to <3 x i1> store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll index d5af3b29118..507f07dee05 100644 --- a/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll +++ b/llvm/test/CodeGen/AMDGPU/store-vector-ptrs.ll @@ -5,7 +5,7 @@ ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting ; scratch loads and stores. 
; CHECK-LABEL: {{^}}store_vector_ptrs: -define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { +define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind { %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16> store <4 x i32*> %p, <4 x i32*>* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/store_typed.ll b/llvm/test/CodeGen/AMDGPU/store_typed.ll index 515fcf04f40..eaa21617f93 100644 --- a/llvm/test/CodeGen/AMDGPU/store_typed.ll +++ b/llvm/test/CodeGen/AMDGPU/store_typed.ll @@ -6,7 +6,7 @@ ; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1 ; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}} -define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { +define amdgpu_kernel void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0) ret void } @@ -16,7 +16,7 @@ define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) { ; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1 ; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}} -define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { +define amdgpu_kernel void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) { call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/structurize.ll b/llvm/test/CodeGen/AMDGPU/structurize.ll index 174e64e2cf8..3cceb2d45c9 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize.ll @@ -45,7 +45,7 @@ ; CHECK: CF_END -define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) { entry: %0 = icmp ne i32 %a, 0 br i1 %0, label %diamond_head, label %branch_from diff --git a/llvm/test/CodeGen/AMDGPU/structurize1.ll b/llvm/test/CodeGen/AMDGPU/structurize1.ll index db0f50247e3..2e7d0e615e0 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize1.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize1.ll @@ -19,7 +19,7 @@ ; CHECK-LABEL: {{^}}if_inside_loop: ; CHECK: LOOP_START_DX10 ; CHECK: END_LOOP -define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { +define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) { entry: br label %for.body diff --git a/llvm/test/CodeGen/AMDGPU/sub.i16.ll b/llvm/test/CodeGen/AMDGPU/sub.i16.ll index b2797ceecf3..ada72140563 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.i16.ll @@ -7,7 +7,7 @@ ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -24,7 +24,7 @@ define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_constant(i16 addrspace(1)* 
%out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -39,7 +39,7 @@ define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* % ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -54,7 +54,7 @@ define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1 ; VI: flat_load_ushort [[A:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]] ; VI-NEXT: buffer_store_short [[ADD]] -define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -70,7 +70,7 @@ define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* ; VI: flat_load_ushort [[B:v[0-9]+]] ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: buffer_store_dword [[ADD]] -define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -90,7 +90,7 @@ define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]] ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}} -define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -110,7 +110,7 @@ define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]] ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: buffer_store_dword [[SEXT]] -define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i32, i32 
addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -131,7 +131,7 @@ define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1) ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid @@ -149,7 +149,7 @@ define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1) ; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute: ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}} ; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}} -define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 { %size = call i32 @llvm.amdgcn.groupstaticsize() %size.trunc = trunc i32 %size to i16 call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds) diff --git a/llvm/test/CodeGen/AMDGPU/sub.ll b/llvm/test/CodeGen/AMDGPU/sub.ll index 5816345098a..f366029fdea 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.ll @@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -25,7 +25,7 @@ define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -45,7 +45,7 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1) ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}} -define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -55,7 +55,7 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1) } ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_i16(i16 
addrspace(1)* %out, i16 addrspace(1)* %in) { %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %a = load i16, i16 addrspace(1)* %in %b = load i16, i16 addrspace(1)* %b_ptr @@ -69,7 +69,7 @@ define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1 %a = load <2 x i16>, <2 x i16> addrspace(1) * %in %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr @@ -85,7 +85,7 @@ define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1) ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1 %a = load <4 x i16>, <4 x i16> addrspace(1) * %in %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr @@ -103,7 +103,7 @@ define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1) ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT {{[* ]*}} -define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { +define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind { %result = sub i64 %a, %b store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -118,7 +118,7 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT {{[* ]*}} -define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { +define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid @@ -134,7 +134,7 @@ define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias ; SI: v_subb_u32_e32 ; SI: v_sub_i32_e32 ; SI: v_subb_u32_e32 -define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid @@ -154,7 +154,7 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace( ; SI: v_subb_u32_e32 ; SI: v_subrev_i32_e32 ; SI: v_subb_u32_e32 -define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { +define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) { %tid = call i32 @llvm.r600.read.tidig.x() readnone %a_ptr = getelementptr <4 x i64>, <4 x i64> 
addrspace(1)* %inA, i32 %tid %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index b282b510806..6736f89a655 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -7,7 +7,7 @@ ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -27,7 +27,7 @@ define void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; VI: s_sub_i32 ; VI: s_sub_i32 -define void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { +define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 %add = sub <2 x i16> %a, %b @@ -38,7 +38,7 @@ define void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace( ; GCN-LABEL: {{^}}s_test_sub_self_v2i16: ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]] ; GCN: buffer_store_dword [[ZERO]] -define void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { +define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 %add = sub <2 x i16> %a, %a store <2 x i16> %add, <2 x i16> addrspace(1)* %out @@ -51,7 +51,7 @@ define void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrs ; VI: v_subrev_i32_e32 ; VI: v_subrev_i32_e32 -define void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { +define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { %add = sub <2 x i16> %a, %b store <2 x i16> %add, <2 x i16> addrspace(1)* %out ret void @@ -63,7 +63,7 @@ define void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}} -define void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -80,7 +80,7 @@ define void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> a ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}} ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}} -define void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> 
addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -99,7 +99,7 @@ define void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i1 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]] ; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -118,7 +118,7 @@ define void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16 ; VI-NOT: v_subrev_i16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -138,7 +138,7 @@ define void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 ; VI-NOT: v_subrev_i16 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, ; VI: v_or_b32_e32 -define void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -170,7 +170,7 @@ define void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x ; VI-NOT: and ; VI-NOT: shl ; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} -define void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -205,7 +205,7 @@ define void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; VI-DAG: v_subrev_u16_e32 ; VI: buffer_store_dwordx4 -define void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -231,7 +231,7 @@ define 
void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i ; VI: v_subrev_u16_e32 ; VI: v_subrev_u16_e32 ; VI: buffer_store_dwordx2 -define void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid @@ -259,7 +259,7 @@ define void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i ; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} -define void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { +define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll index 0ec45c68bb7..c2d04abf829 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL:{{^}}row_filter_C1_D0: -define void @row_filter_C1_D0() #0 { +define amdgpu_kernel void @row_filter_C1_D0() #0 { entry: br i1 undef, label %for.inc.1, label %do.body.preheader diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll index 72a1f1e25b3..35615c40d49 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -20,7 +20,7 @@ target triple="amdgcn--" ; CHECK-NEXT: s_mov_b32 s6, -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; CHECK-NEXT: s_endpgm -define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { +define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind { entry: %v0 = insertelement <4 x float> undef, float %a0, i32 0 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll index 8bd995a8ecb..57c267e54a1 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll @@ -5,7 +5,7 @@ ; Just make sure this test doesn't crash. 
; CHECK-LABEL: foobar: ; CHECK: s_endpgm -define void @foobar() { +define amdgpu_kernel void @foobar() { %v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3> %v3 = sext <4 x i1> %v0 to <4 x i32> %v4 = extractelement <4 x i32> %v3, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir b/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir index c4e00215159..c477fe9bc6d 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-intervals.mir @@ -10,8 +10,8 @@ # CHECK-LABEL: Machine code for function test1: --- | - define void @test0() { ret void } - define void @test1() { ret void } + define amdgpu_kernel void @test0() { ret void } + define amdgpu_kernel void @test1() { ret void } ... --- name: test0 diff --git a/llvm/test/CodeGen/AMDGPU/target-cpu.ll b/llvm/test/CodeGen/AMDGPU/target-cpu.ll index cf80ff3f4c8..466e89ebee8 100644 --- a/llvm/test/CodeGen/AMDGPU/target-cpu.ll +++ b/llvm/test/CodeGen/AMDGPU/target-cpu.ll @@ -14,7 +14,7 @@ declare void @llvm.amdgcn.s.dcache.wb() #0 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @target_none() #0 { +define amdgpu_kernel void @target_none() #0 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -30,7 +30,7 @@ define void @target_none() #0 { ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]] ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 -define void @target_tahiti() #1 { +define amdgpu_kernel void @target_tahiti() #1 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -46,7 +46,7 @@ define void @target_tahiti() #1 { ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 ; CHECK: s_dcache_inv_vol -define void @target_bonaire() #3 { +define amdgpu_kernel void @target_bonaire() #3 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -63,7 +63,7 @@ define void @target_bonaire() #3 { ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400 ; CHECK: flat_store_dword ; CHECK: s_dcache_wb{{$}} -define void @target_fiji() #4 { +define amdgpu_kernel void @target_fiji() #4 { %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024 %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)* @@ -79,7 +79,7 @@ define void @target_fiji() #4 { ; CHECK-LABEL: {{^}}promote_alloca_enabled: ; CHECK: ds_read_b32 ; CHECK: ; LDSByteSize: 5120 -define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { 
+define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 @@ -93,7 +93,7 @@ entry: ; CHECK: SCRATCH_RSRC_DWORD0 ; CHECK: SCRATCH_RSRC_DWORD1 ; CHECK: ScratchSize: 24 -define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { +define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 { entry: %stack = alloca [5 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/trap.ll b/llvm/test/CodeGen/AMDGPU/trap.ll index 15f055e8c2b..77ad895d0e8 100644 --- a/llvm/test/CodeGen/AMDGPU/trap.ll +++ b/llvm/test/CodeGen/AMDGPU/trap.ll @@ -38,7 +38,7 @@ declare void @llvm.debugtrap() #0 ; TRAP-BIT: enable_trap_handler = 1 ; NO-TRAP-BIT: enable_trap_handler = 0 ; NO-MESA-TRAP: s_endpgm -define void @hsa_trap() { +define amdgpu_kernel void @hsa_trap() { call void @llvm.trap() ret void } @@ -64,7 +64,7 @@ define void @hsa_trap() { ; TRAP-BIT: enable_trap_handler = 1 ; NO-TRAP-BIT: enable_trap_handler = 0 ; NO-MESA-TRAP: s_endpgm -define void @hsa_debugtrap() { +define amdgpu_kernel void @hsa_debugtrap() { call void @llvm.debugtrap() ret void } @@ -75,7 +75,7 @@ define void @hsa_debugtrap() { ; NO-TRAP-BIT: enable_trap_handler = 0 ; NO-HSA-TRAP: s_endpgm ; NO-MESA-TRAP: s_endpgm -define void @trap() { +define amdgpu_kernel void @trap() { call void @llvm.trap() ret void } diff --git a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll index a331475820a..f90040385f7 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in %bc = bitcast <2 x i32> %ld to i64 %trunc = trunc i64 %bc to i32 @@ -15,7 +15,7 @@ define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace ; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) { %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in %bc = bitcast <3 x i32> %ld to i96 %trunc = trunc i96 %bc to i32 @@ -26,7 +26,7 @@ define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace ; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32: ; CHECK: buffer_load_dword v ; CHECK: buffer_store_dword v -define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in %bc = bitcast <4 x i32> %ld to i128 %trunc = trunc i128 %bc to i32 @@ -38,7 +38,7 @@ define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspac ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_short [[VAL]] 
-define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in %bc = bitcast <2 x i16> %ld to i32 %trunc = trunc i32 %bc to i16 @@ -54,7 +54,7 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace ; SI: buffer_load_dword v[[VAL:[0-9]+]] ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]] ; CHECK: buffer_store_short [[VAL]] -define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) { %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in %bc = bitcast <4 x i16> %ld to i64 %trunc = trunc i64 %bc to i16 @@ -66,7 +66,7 @@ define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace ; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8: ; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) { %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in %bc = bitcast <2 x i8> %ld to i16 %trunc = trunc i16 %bc to i8 @@ -77,7 +77,7 @@ define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* ; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) { %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in %bc = bitcast <4 x i8> %ld to i32 %trunc = trunc i32 %bc to i8 @@ -88,7 +88,7 @@ define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1) ; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8: ; CHECK: buffer_load_dword [[VAL:v[0-9]+]] ; CHECK: buffer_store_byte [[VAL]] -define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) { +define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) { %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in %bc = bitcast <3 x i8> %ld to i24 %trunc = trunc i24 %bc to i8 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll index 7a4bced9d43..cb8d3655033 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}} ; SI: v_cndmask_b32_e64 ; SI: buffer_store_byte -define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, 0 @@ -25,7 +25,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void 
@zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, 0 @@ -36,7 +36,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, 1 @@ -48,7 +48,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, 1 @@ -60,7 +60,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp eq i32 %ext, -1 @@ -71,7 +71,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp eq i32 %ext, -1 @@ -84,7 +84,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, 0 @@ -96,7 +96,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]] ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, 0 @@ -107,7 +107,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, 
i1 addrspa ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, 1 @@ -122,7 +122,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]] ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, 1 @@ -137,7 +137,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}} ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]] ; XSI-NEXT: buffer_store_byte [[RESULT]] -define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = sext i1 %load to i32 %cmp = icmp ne i32 %ext, -1 @@ -148,7 +148,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1: ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}} ; SI: buffer_store_byte [[RESULT]] -define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind { %load = load i1, i1 addrspace(1)* %in %ext = zext i1 %load to i32 %cmp = icmp ne i32 %ext, -1 @@ -162,7 +162,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}} ; SI-NEXT: v_cndmask_b32_e64 ; SI: {{buffer|flat}}_store_byte -define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { +define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind { %tid.x = call i32 @llvm.amdgcn.workitem.id.x() %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x %load = load i8, i8 addrspace(1)* %in.ptr diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll index 03b8af0610d..d67b8f981b2 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll @@ -2,7 +2,7 @@ ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16: ; GCN: s_endpgm -define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 { %val = load double, double addrspace(1)* %in %cvt = fptrunc double %val to half store half %cvt, half addrspace(1)* %out @@ -11,7 +11,7 @@ define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, 
double addrsp ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16: ; GCN: s_endpgm -define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 { %val = load <2 x double>, <2 x double> addrspace(1)* %in %cvt = fptrunc <2 x double> %val to <2 x half> store <2 x half> %cvt, <2 x half> addrspace(1)* %out @@ -20,7 +20,7 @@ define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16: ; GCN: s_endpgm -define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 { %val = load <3 x double>, <3 x double> addrspace(1)* %in %cvt = fptrunc <3 x double> %val to <3 x half> store <3 x half> %cvt, <3 x half> addrspace(1)* %out @@ -29,7 +29,7 @@ define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16: ; GCN: s_endpgm -define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 { %val = load <4 x double>, <4 x double> addrspace(1)* %in %cvt = fptrunc <4 x double> %val to <4 x half> store <4 x half> %cvt, <4 x half> addrspace(1)* %out @@ -38,7 +38,7 @@ define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16: ; GCN: s_endpgm -define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 { %val = load <8 x double>, <8 x double> addrspace(1)* %in %cvt = fptrunc <8 x double> %val to <8 x half> store <8 x half> %cvt, <8 x half> addrspace(1)* %out @@ -47,7 +47,7 @@ define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16: ; GCN: s_endpgm -define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { +define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 { %val = load <16 x double>, <16 x double> addrspace(1)* %in %cvt = fptrunc <16 x double> %val to <16 x half> store <16 x half> %cvt, <16 x half> addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll index da2a5b43dad..4ea2352f57f 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store-i1.ll @@ -7,7 +7,7 @@ ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { +define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind { %trunc = trunc i32 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void @@ -15,7 +15,7 @@ define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwi ; SI-LABEL: 
{{^}}global_truncstore_i64_to_i1: ; SI: buffer_store_byte -define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { +define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind { %trunc = trunc i64 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void @@ -26,13 +26,13 @@ define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwi ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]] ; SI: buffer_store_byte [[VREG]], -define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { +define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind { %trunc = trunc i16 %val to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 ret void } ; SI-LABEL: {{^}}global_truncstore_i16_to_i1: -define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { +define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind { %add = add i16 %val0, %val1 %trunc = trunc i16 %add to i1 store i1 %trunc, i1 addrspace(1)* %out, align 1 diff --git a/llvm/test/CodeGen/AMDGPU/trunc-store.ll b/llvm/test/CodeGen/AMDGPU/trunc-store.ll index c6727e1e127..f45de679588 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-store.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-store.ll @@ -3,7 +3,7 @@ ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8: ; SI: buffer_store_dwordx4 -define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { +define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) { %trunc = trunc <16 x i32> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out ret void @@ -11,7 +11,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8: ; SI: buffer_store_dwordx4 -define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { +define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) { %trunc = trunc <16 x i64> %in to <16 x i8> store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll index 878ea3f4899..3dbc10d2e9b 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}test: ; CHECK: MEM_RAT_CACHELESS STORE_RAW -define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { +define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %if, label %done diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll index a8f10cc5a9c..0c91d52df0c 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc.ll @@ -4,7 +4,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone -define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { ; GCN-LABEL: {{^}}trunc_i64_to_i32_store: ; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], ; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]] @@ -28,7 +28,7 @@ define void 
@trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) { ; SI: buffer_store_dword [[VSHL]] ; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]] -define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { %b = shl i64 %a, 2 %result = trunc i64 %b to i32 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -46,7 +46,7 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) { ; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]] ; GCN: v_mov_b32_e32 ; GCN: v_mov_b32_e32 -define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) { %aa = add i64 %a, 234 ; Prevent shrinking store. %b = shl i64 %aa, 2 %result = trunc i64 %b to i32 @@ -57,7 +57,7 @@ define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 ; GCN-LABEL: {{^}}trunc_i32_to_i1: ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} -define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { +define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { %a = load i32, i32 addrspace(1)* %ptr, align 4 %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 @@ -67,7 +67,7 @@ define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) { ; GCN-LABEL: {{^}}trunc_i8_to_i1: ; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}} -define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { +define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { %a = load i8, i8 addrspace(1)* %ptr, align 4 %trunc = trunc i8 %a to i1 %result = select i1 %trunc, i8 1, i8 0 @@ -77,7 +77,7 @@ define void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) { ; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1: ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 -define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { +define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { %trunc = trunc i16 %a to i1 %result = select i1 %trunc, i16 1, i16 0 store i16 %result, i16 addrspace(1)* %out, align 4 @@ -86,7 +86,7 @@ define void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) { ; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1: ; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 -define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { %trunc = trunc i32 %a to i1 %result = select i1 %trunc, i32 1, i32 0 store i32 %result, i32 addrspace(1)* %out, align 4 @@ -99,7 +99,7 @@ define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) { ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]] ; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}} ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]] -define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { +define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { %trunc = trunc i64 %x to i1 %sel = select i1 %trunc, i32 63, i32 -12 store i32 %sel, i32 addrspace(1)* %out @@ -112,7 +112,7 @@ define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) { ; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]] ; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]] ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc -define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { +define 
amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll index 76c32afc1f2..7c369a31276 100644 --- a/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll +++ b/llvm/test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -19,7 +19,7 @@ ; CHECK: store i8 0, i8 addrspace(1)* ; CHECK-NOT: store i8 0, i8 addrspace(1)* ; CHECK: ret void -define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { +define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { entry: %add = add nsw i32 %b, 4 %cmp = icmp sgt i32 %add, %a diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index d62e4e0b140..632ccaa7e61 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -9,7 +9,7 @@ ; EG: ADDC_UINT ; EG: ADDC_UINT -define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -27,7 +27,7 @@ define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; EG: ADDC_UINT ; EG: ADD_INT -define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %uadd, 0 %carry = extractvalue { i32, i1 } %uadd, 1 @@ -42,7 +42,7 @@ define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -63,7 +63,7 @@ define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -85,7 +85,7 @@ define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryou ; EG: ADDC_UINT ; EG: ADD_INT -define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %uadd, 0 %carry = extractvalue { i64, i1 } %uadd, 1 @@ -100,7 +100,7 @@ 
define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; EG: ADDC_UINT ; EG: ADD_INT -define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr @@ -118,7 +118,7 @@ define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; FUNC-LABEL: {{^}}v_uaddo_i16: ; VI: v_add_u16_e32 ; VI: v_cmp_lt_u16_e32 -define void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index ed791bc3a5c..2874a0cdbc0 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -10,7 +10,7 @@ ; EG: CF_END ; SI: v_rcp_iflag_f32_e32 -define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -21,7 +21,7 @@ define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}s_udiv_i32: ; SI: v_rcp_iflag_f32_e32 -define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = udiv i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; SI: v_rcp_iflag_f32_e32 ; SI: v_rcp_iflag_f32_e32 ; SI: s_endpgm -define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1) * %in %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr @@ -50,7 +50,7 @@ define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i ; FUNC-LABEL: {{^}}udiv_v4i32: ; EG: CF_END ; SI: s_endpgm -define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1) * %in %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr @@ -63,7 +63,7 @@ define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i ; SI: buffer_load_dword [[VAL:v[0-9]+]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, 
i32 addrspace(1)* %in %result = udiv i32 %a, 16 @@ -77,7 +77,7 @@ define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 34259182 @@ -91,7 +91,7 @@ define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]] ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]] ; SI: buffer_store_dword [[RESULT]] -define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %result = udiv i32 %a, 34259183 @@ -103,7 +103,7 @@ define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -117,7 +117,7 @@ define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in %den = load i16, i16 addrspace(1) * %den_ptr @@ -131,7 +131,7 @@ define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) { ; SI: v_rcp_f32 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}} ; SI: buffer_store_dword [[TRUNC]] -define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1 %num = load i23, i23 addrspace(1) * %in %den = load i23, i23 addrspace(1) * %den_ptr @@ -143,7 +143,7 @@ define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}v_udiv_i24: ; SI-NOT: v_rcp_f32 -define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { +define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1 %num = load i24, i24 addrspace(1) * %in %den = load i24, i24 addrspace(1) * %den_ptr @@ -159,7 +159,7 @@ define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) { ; SI: v_mul_hi_u32 ; SI: v_mul_hi_u32 -define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { +define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) { %1 = load <4 x i32>, <4 x i32> 
addrspace(1)* %in, align 16 %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668> store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16 @@ -168,7 +168,7 @@ define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %i ; FUNC-LABEL: {{^}}test_udiv2: ; SI: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1 -define void @test_udiv2(i32 %p) { +define amdgpu_kernel void @test_udiv2(i32 %p) { %i = udiv i32 %p, 2 store volatile i32 %i, i32 addrspace(1)* undef ret void @@ -178,7 +178,7 @@ define void @test_udiv2(i32 %p) { ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab ; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}} ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0 -define void @test_udiv_3_mulhu(i32 %p) { +define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) { %i = udiv i32 %p, 3 store volatile i32 %i, i32 addrspace(1)* undef ret void diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index 17f4ebf175d..9507a49cfc8 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -51,7 +51,7 @@ ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { +define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) { %result0 = udiv i32 %x, %y store i32 %result0, i32 addrspace(1)* %out0 %result1 = urem i32 %x, %y @@ -158,7 +158,7 @@ define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 ; SI-DAG: v_cndmask_b32_e64 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { +define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) { %result0 = udiv <2 x i32> %x, %y store <2 x i32> %result0, <2 x i32> addrspace(1)* %out %result1 = urem <2 x i32> %x, %y @@ -340,7 +340,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3 ; SI-DAG: v_subrev_i32_e32 ; SI-DAG: v_cndmask_b32_e64 ; SI: s_endpgm -define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { +define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) { %result0 = udiv <4 x i32> %x, %y store <4 x i32> %result0, <4 x i32> addrspace(1)* %out %result1 = urem <4 x i32> %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll index 6d145f1dbf0..6f144dcc6fd 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -12,7 +12,7 @@ ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -31,7 +31,7 @@ define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -50,7 +50,7 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 
addrspace(1)* %in) { ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -67,7 +67,7 @@ define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -84,7 +84,7 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -101,7 +101,7 @@ define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; SI: v_rcp_iflag ; SI-NOT v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -121,7 +121,7 @@ define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -141,7 +141,7 @@ define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -161,7 +161,7 @@ define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -184,7 +184,7 @@ define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { 
+define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 %num = load i8, i8 addrspace(1) * %in %den = load i8, i8 addrspace(1) * %den_ptr @@ -203,7 +203,7 @@ define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) { ; EG-DAG: UINT_TO_FLT ; EG-DAG: RECIP_IEEE ; EG: FLT_TO_UINT -define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1 %num = load i16, i16 addrspace(1) * %in, align 2 %den = load i16, i16 addrspace(1) * %den_ptr, align 2 @@ -215,7 +215,7 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}urem24_i32: ; SI-NOT: v_rcp_f32 ; EG-NOT: RECIP_IEEE -define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -235,7 +235,7 @@ define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -255,7 +255,7 @@ define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -275,7 +275,7 @@ define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; EG-NOT: UINT_TO_FLT ; EG-NOT: RECIP_IEEE -define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -294,7 +294,7 @@ define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], ; EG: RECIP_IEEE -define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 @@ -313,7 +313,7 @@ define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* % ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], ; EG: RECIP_IEEE -define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %num = load i32, i32 
addrspace(1) * %in, align 4 %den = load i32, i32 addrspace(1) * %den_ptr, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem64.ll b/llvm/test/CodeGen/AMDGPU/udivrem64.ll index da61a841ff3..bd297920d56 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem64.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem64.ll @@ -70,7 +70,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -144,7 +144,7 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -159,7 +159,7 @@ define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = udiv i64 %1, %2 @@ -176,7 +176,7 @@ define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = urem i64 %1, %2 @@ -195,7 +195,7 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = udiv i64 %1, %2 @@ -214,7 +214,7 @@ define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = urem i64 %1, %2 diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll index a4e18ebc912..62943aeefbd 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]] ; SI: buffer_store_dwordx2 [[RESULT]] -define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %val = load i64, i64 addrspace(1)* %gep, align 8 @@ -19,21 +19,21 @@ define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1) } ; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64 -define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { +define amdgpu_kernel void 
@s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) { %cast = uitofp i64 %in to double store double %cast, double addrspace(1)* %out, align 8 ret void } ; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64 -define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) { %cast = uitofp <2 x i64> %in to <2 x double> store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 ret void } ; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64 -define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { +define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) { %cast = uitofp <4 x i64> %in to <4 x double> store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 ret void @@ -42,7 +42,7 @@ define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i ; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { %cast = uitofp i32 %in to double store double %cast, double addrspace(1)* %out, align 8 ret void @@ -52,7 +52,7 @@ define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) { ; SI: v_cvt_f64_u32_e32 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { +define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) { %cast = uitofp <2 x i32> %in to <2 x double> store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16 ret void @@ -64,7 +64,7 @@ define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i ; SI: v_cvt_f64_u32_e32 ; SI: v_cvt_f64_u32_e32 ; SI: s_endpgm -define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { +define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) { %cast = uitofp <4 x i32> %in to <4 x double> store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16 ret void @@ -79,7 +79,7 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}} ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}} ; SI: s_endpgm -define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to double store double %fp, double addrspace(1)* %out, align 4 @@ -91,7 +91,7 @@ define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) { ; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]] ; SI: buffer_store_dwordx2 [[RESULT]] ; SI: s_endpgm -define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { +define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) { %fp = uitofp i1 %in to double store double %fp, double addrspace(1)* %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll index cd816b27fce..4168326e14c 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -4,7 +4,7 @@ ; FIXME: This should be merged with 
uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16: -define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { %result = uitofp i64 %in to half store half %result, half addrspace(1)* %out ret void @@ -24,7 +24,7 @@ define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 { ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]] ; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]] -define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -35,7 +35,7 @@ define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* } ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32: -define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { %result = uitofp i64 %in to float store float %result, float addrspace(1)* %out ret void @@ -54,7 +54,7 @@ define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { ; GCN: v_add_i32_e32 [[VR:v[0-9]+]] ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]] -define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -65,14 +65,14 @@ define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* } ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32: -define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{ %result = uitofp <2 x i64> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32: -define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -83,14 +83,14 @@ define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6 } ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16: -define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ +define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{ %result = uitofp <2 x i64> %in to <2 x half> store <2 x half> %result, <2 x half> addrspace(1)* %out ret void } ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16: -define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> 
addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll index 3003226ca1a..2e9918717c3 100644 --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.ll @@ -6,7 +6,7 @@ ; SI: v_cvt_f32_u32_e32 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z -define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { %result = uitofp i32 %in to float store float %result, float addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}} ; R600: INT_TO_FLT -define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -32,7 +32,7 @@ define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X -define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 { %result = uitofp <2 x i32> %in to <2 x float> store <2 x float> %result, <2 x float> addrspace(1)* %out ret void @@ -49,7 +49,7 @@ define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i3 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %value = load <4 x i32>, <4 x i32> addrspace(1) * %in %result = uitofp <4 x i32> %value to <4 x float> store <4 x float> %result, <4 x float> addrspace(1)* %out @@ -66,7 +66,7 @@ define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid @@ -81,7 +81,7 @@ define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]] ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float 
addrspace(1)* %out, i32 %in) #0 { %cmp = icmp eq i32 %in, 0 %fp = uitofp i1 %cmp to float store float %fp, float addrspace(1)* %out @@ -92,7 +92,7 @@ define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 ; SI: buffer_store_dword [[RESULT]], ; SI: s_endpgm -define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { %fp = uitofp i1 %in to float store float %fp, float addrspace(1)* %out ret void @@ -105,7 +105,7 @@ define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 { ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]] ; SI: s_endpgm -define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { +define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -126,7 +126,7 @@ define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* ; R600-DAG: SETGT_UINT ; R600-DAG: SETE_INT -define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { +define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 { entry: %cvt = uitofp i64 %in to float store float %cvt, float addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index 18bd34880af..4df2681fa9a 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -8,7 +8,7 @@ ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @uitofp_i16_to_f16( +define amdgpu_kernel void @uitofp_i16_to_f16( half addrspace(1)* %r, i16 addrspace(1)* %a) { entry: @@ -24,7 +24,7 @@ entry: ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @uitofp_i32_to_f16( +define amdgpu_kernel void @uitofp_i32_to_f16( half addrspace(1)* %r, i32 addrspace(1)* %a) { entry: @@ -48,7 +48,7 @@ entry: ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @uitofp_v2i16_to_v2f16( +define amdgpu_kernel void @uitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i16> addrspace(1)* %a) { entry: @@ -68,7 +68,7 @@ entry: ; GCN-DAG: v_or_b32_e32 ; GCN: buffer_store_dword ; GCN: s_endpgm -define void @uitofp_v2i32_to_v2f16( +define amdgpu_kernel void @uitofp_v2i32_to_v2f16( <2 x half> addrspace(1)* %r, <2 x i32> addrspace(1)* %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/umed3.ll b/llvm/test/CodeGen/AMDGPU/umed3.ll index 326a4460ab8..5a579f3575f 100644 --- a/llvm/test/CodeGen/AMDGPU/umed3.ll +++ b/llvm/test/CodeGen/AMDGPU/umed3.ll @@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32: ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -25,7 +25,7 @@ define void 
@v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32: ; GCN: v_max_u32 ; GCN: v_min_u32 -define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -45,7 +45,7 @@ define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32: ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} -define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -64,7 +64,7 @@ define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32: ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}} ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}} -define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid @@ -83,7 +83,7 @@ define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64: ; GCN: v_cmp_lt_u64 ; GCN: v_cmp_gt_u64 -define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid @@ -102,7 +102,7 @@ define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16: ; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17 -define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { +define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid @@ -173,7 +173,7 @@ define internal i8 @umax8(i8 %x, i8 %y) #2 { ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -185,7 +185,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1: ; GCN: v_med3_u32 v{{[0-9]+}}, 
s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -197,7 +197,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -209,7 +209,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -221,7 +221,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -233,7 +233,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -245,7 +245,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -257,7 +257,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -269,7 +269,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -281,7 +281,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, 
i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -293,7 +293,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -305,7 +305,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -317,7 +317,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -329,7 +329,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -341,7 +341,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -353,7 +353,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %y, i32 %x) %tmp1 = call i32 @umax(i32 %y, i32 %x) @@ -368,7 +368,7 @@ bb: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 { bb: %tmp0 = call i16 @umin16(i16 %x, i16 %y) %tmp1 = call i16 @umax16(i16 %x, i16 %y) @@ -383,7 +383,7 @@ bb: ; GCN: s_and_b32 ; GCN: s_and_b32 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 { bb: %tmp0 = call i8 @umin8(i8 %x, i8 %y) %tmp1 = call i8 @umax8(i8 %x, i8 %y) @@ -395,7 +395,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel 
void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -408,7 +408,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -421,7 +421,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2: ; GCN-NOT: v_med3_u32 -define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -434,7 +434,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -447,7 +447,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 1, i32 %y) %tmp1 = call i32 @umax(i32 1, i32 %y) @@ -459,7 +459,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} -define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 2) %tmp1 = call i32 @umax(i32 %x, i32 2) @@ -471,7 +471,7 @@ bb: ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2: ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9 -define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { +define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 { bb: %tmp0 = call i32 @umin(i32 %x, i32 %y) %tmp1 = call i32 @umax(i32 %x, i32 %y) @@ -491,7 +491,7 @@ bb: ; VI: v_max_u16 ; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { +define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll index 0f76a54975e..68aacd084bf 100644 --- a/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll +++ b/llvm/test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -8,7 +8,7 @@ ; SI: ds_write_b8 ; SI: ds_write_b8 ; SI: s_endpgm -define void 
@local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 { %v = load i16, i16 addrspace(3)* %p, align 1 store i16 %v, i16 addrspace(3)* %r, align 1 ret void @@ -23,7 +23,7 @@ define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace( ; UNALIGNED: buffer_load_ushort ; UNALIGNED: buffer_store_short ; SI: s_endpgm -define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 { %v = load i16, i16 addrspace(1)* %p, align 1 store i16 %v, i16 addrspace(1)* %r, align 1 ret void @@ -42,7 +42,7 @@ define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace ; SI: ds_write_b8 ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { %v = load i32, i32 addrspace(3)* %p, align 1 store i32 %v, i32 addrspace(3)* %r, align 1 ret void @@ -60,7 +60,7 @@ define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace( ; UNALIGNED: buffer_load_dword ; UNALIGNED: buffer_store_dword -define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(1)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 1 ret void @@ -74,7 +74,7 @@ define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace ; UNALIGNED: buffer_load_dword ; UNALIGNED: buffer_store_dword -define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(1)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 2 ret void @@ -85,7 +85,7 @@ define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1) ; GCN: ds_read_u16 ; GCN: ds_write_b16 ; GCN: ds_write_b16 -define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 { %v = load i32, i32 addrspace(3)* %p, align 2 store i32 %v, i32 addrspace(3)* %r, align 2 ret void @@ -132,7 +132,7 @@ define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* ; SI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 { %v = load i64, i64 addrspace(3)* %p, align 1 store i64 %v, i64 addrspace(3)* %r, align 1 ret void @@ -179,7 +179,7 @@ define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace( ; SI-NOT: v_lshl ; SI: ds_write_b8 ; SI: s_endpgm -define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 { %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1 store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1 ret void @@ -209,7 
+209,7 @@ define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i ; UNALIGNED: buffer_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(1)* %p, align 2 store i64 %v, i64 addrspace(1)* %r, align 2 ret void @@ -239,7 +239,7 @@ define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1) ; UNALIGNED: buffer_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(1)* %p, align 1 store i64 %v, i64 addrspace(1)* %r, align 1 ret void @@ -286,7 +286,7 @@ define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace ; GCN: ds_write_b8 ; GCN: ds_write_b8 ; GCN: s_endpgm -define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 { +define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1 ret void @@ -329,7 +329,7 @@ define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i ; UNALIGNED: buffer_load_dwordx4 ; UNALIGNED: buffer_store_dwordx4 -define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1 ret void @@ -337,7 +337,7 @@ define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x ; FUNC-LABEL: {{^}}local_load_i64_align_4: ; GCN: ds_read2_b32 -define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64, i64 addrspace(3)* %in, align 4 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -345,7 +345,7 @@ define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrsp ; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9 -define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4 %val = load i64, i64 addrspace(3)* %ptr, align 4 store i64 %val, i64 addrspace(1)* %out, align 8 @@ -356,7 +356,7 @@ define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1 ; GCN: s_endpgm -define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 
addrspace(3)* %in) #0 { %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)* %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* @@ -375,7 +375,7 @@ define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocaptur ; GCN: ds_read_u8 ; GCN: ds_read_u8 ; GCN: store_dwordx2 -define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 { %val = load i64, i64 addrspace(3)* %in, align 1 store i64 %val, i64 addrspace(1)* %out, align 8 ret void @@ -383,7 +383,7 @@ define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrsp ; FUNC-LABEL: {{^}}local_store_i64_align_4: ; GCN: ds_write2_b32 -define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { +define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { store i64 %val, i64 addrspace(3)* %out, align 4 ret void } @@ -391,7 +391,7 @@ define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 { ; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9 ; GCN: s_endpgm -define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4 store i64 0, i64 addrspace(3)* %ptr, align 4 ret void @@ -401,7 +401,7 @@ define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 { ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1 ; GCN: s_endpgm -define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 { %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)* %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255 %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)* @@ -418,7 +418,7 @@ define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) # ; UNALIGNED: s_load_dword ; SI: buffer_store_dword -define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(2)* %p, align 1 store i32 %v, i32 addrspace(1)* %r, align 4 ret void @@ -430,7 +430,7 @@ define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* ; UNALIGNED: s_load_dword ; UNALIGNED: buffer_store_dword -define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %v = load i32, i32 addrspace(2)* %p, align 2 store i32 %v, i32 addrspace(1)* %r, align 4 ret void @@ -444,7 +444,7 @@ define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r ; UNALIGNED: s_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2 -define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(2)* %p, align 2 store i64 %v, i64 
addrspace(1)* %r, align 4 ret void @@ -453,7 +453,7 @@ define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r ; SI-LABEL: {{^}}constant_align4_load_i64: ; SI: s_load_dwordx2 ; SI: buffer_store_dwordx2 -define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 { %v = load i64, i64 addrspace(2)* %p, align 4 store i64 %v, i64 addrspace(1)* %r, align 4 ret void @@ -462,7 +462,7 @@ define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r ; SI-LABEL: {{^}}constant_align4_load_v4i32: ; SI: s_load_dwordx4 ; SI: buffer_store_dwordx4 -define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void @@ -482,7 +482,7 @@ define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> ad ; UNALIGNED: buffer_load_dwordx2 ; SI: buffer_store_dwordx2 -define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 { %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1 store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4 ret void @@ -512,7 +512,7 @@ define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> ; UNALIGNED: buffer_load_dwordx4 ; SI: buffer_store_dwordx4 -define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 { %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1 store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4 ret void @@ -521,7 +521,7 @@ define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> ; SI-LABEL: {{^}}constant_align4_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { %v = load i8, i8 addrspace(2)* %p, align 4 store i8 %v, i8 addrspace(1)* %r, align 4 ret void @@ -530,7 +530,7 @@ define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) # ; SI-LABEL: {{^}}constant_align2_load_i8: ; SI: buffer_load_ubyte ; SI: buffer_store_byte -define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 { %v = load i8, i8 addrspace(2)* %p, align 2 store i8 %v, i8 addrspace(1)* %r, align 2 ret void @@ -541,7 +541,7 @@ define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) # ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]] ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { +define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 { %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1 %v0 = load i32, i32 addrspace(2)* %p, 
align 4 %v1 = load i32, i32 addrspace(2)* %gep0, align 4 @@ -571,7 +571,7 @@ define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspac ; SI: ds_read_u8 ; SI: ScratchSize: 0{{$}} -define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 { +define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 { %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1 store <16 x i8> %ld, <16 x i8> addrspace(1)* %out ret void @@ -596,7 +596,7 @@ define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> add ; SI: ds_write_b8 ; SI: ScratchSize: 0{{$}} -define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 { +define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 { store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll index f92a847b3e5..3e80fcf85b5 100644 --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}func: -define void @func() #0 { +define amdgpu_kernel void @func() #0 { B0: br i1 undef, label %B1, label %B2 @@ -72,7 +72,7 @@ bb11: ; preds = %bb9 ; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}} -define void @partially_undef_copy() #0 { +define amdgpu_kernel void @partially_undef_copy() #0 { %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"() %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"() diff --git a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll index d96ee6d21ce..60ab7631a10 100644 --- a/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll +++ b/llvm/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll @@ -5,7 +5,7 @@ ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable. 
; COMMON-LABEL: {{^}}branch_true: -define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 true, label %for.end, label %for.body.lr.ph @@ -42,7 +42,7 @@ for.end: ; preds = %for.body, %entry ; SI: s_cbranch_vccnz ; SI: s_cbranch_scc1 ; SI: s_endpgm -define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 false, label %for.end, label %for.body.lr.ph @@ -79,7 +79,7 @@ for.end: ; preds = %for.body, %entry ; SI: s_cbranch_scc1 ; SI: s_cbranch_scc1 ; SI: s_endpgm -define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { +define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 { entry: br i1 undef, label %for.end, label %for.body.lr.ph diff --git a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll index 44330e68356..a9d45d71fa2 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-cfg.ll @@ -12,7 +12,7 @@ ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %else @@ -40,7 +40,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %if, label %else @@ -68,7 +68,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %else, label %if @@ -96,7 +96,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) { entry: %cmp0 = fcmp oeq float %cond, 0.0 br i1 %cmp0, label %else, label %if @@ -123,7 +123,7 @@ done: ; GCN: buffer_store_dword ; GCN: [[ENDIF_LABEL]]: ; GCN: s_endpgm -define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { +define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -148,7 +148,7 @@ endif: ; GCN: buffer_store_dword ; GCN: [[ENDIF_LABEL]]: ; GCN: s_endpgm -define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { +define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { entry: %a.0 = fadd float %a, 10.0 %cond = bitcast float %a.0 to i32 @@ -176,7 +176,7 @@ endif: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] ; GCN: s_endpgm -define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) { +define amdgpu_kernel void 
@uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) { entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -209,7 +209,7 @@ if.end: ; preds = %if.else, %if.then ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3 ; GCN: buffer_store_dword [[THREE]] ; GCN: s_endpgm -define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) { +define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) { entry: %cmp = icmp eq i32 %a, 0 br i1 %cmp, label %if.then, label %if.else @@ -233,7 +233,7 @@ if.end: ; preds = %if.else, %if.then ; GCN: buffer_store_dword ; GCN: [[LABEL]]: ; GCN: s_endpgm -define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -258,7 +258,7 @@ ENDIF: ; preds = %IF, %main_body ; GCN: {{^}}[[BODY]]: ; GCN: buffer_store ; GCN: s_endpgm -define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { +define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %cmp0 = icmp sgt i32 %cond0, 0 @@ -284,7 +284,7 @@ bb9: ; preds = %bb8, %bb4 ; SI: s_cmp_lg_u32 [[I]], 0 ; SI: s_cbranch_scc1 [[LOOP_LABEL]] ; SI: s_endpgm -define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) { entry: br label %loop @@ -310,7 +310,7 @@ done: ; GCN: {{^}}[[IF_UNIFORM_LABEL]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp ult i32 %tid, 16 @@ -338,7 +338,7 @@ endif: ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN: buffer_store_dword [[ONE]] -define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: %u_cmp = icmp eq i32 %cond, 0 br i1 %u_cmp, label %if, label %endif @@ -370,7 +370,7 @@ endif: ; GCN: [[IF_UNIFORM]]: ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; GCN: buffer_store_dword [[TWO]] -define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %d_cmp = icmp eq i32 %tid, 0 @@ -410,7 +410,7 @@ exit: ; GCN: BB[[FNNUM]]_3: ; GCN: s_endpgm -define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp1 = icmp sgt i32 %cond, 0 @@ -445,7 +445,7 @@ bb9: ; preds = %bb8, %bb4 ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp eq i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -477,7 +477,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: 
v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp ne i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -505,7 +505,7 @@ done: ; GCN: [[IF_LABEL]]: ; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]] ; GCN: buffer_store_dword [[V_VAL]] -define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { +define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) { entry: %cmp0 = icmp sgt i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -524,7 +524,7 @@ done: ; GCN-LABEL: {{^}}move_to_valu_i64_eq: ; GCN: v_cmp_eq_u64_e32 -define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) { +define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) { %cond = load volatile i64, i64 addrspace(3)* undef %cmp0 = icmp eq i64 %cond, 0 br i1 %cmp0, label %if, label %else @@ -543,7 +543,7 @@ done: ; GCN-LABEL: {{^}}move_to_valu_i64_ne: ; GCN: v_cmp_ne_u64_e32 -define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) { +define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) { %cond = load volatile i64, i64 addrspace(3)* undef %cmp0 = icmp ne i64 %cond, 0 br i1 %cmp0, label %if, label %else diff --git a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll index cfbb2af5867..028199ef9de 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-crash.ll @@ -6,7 +6,7 @@ ; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] ; GCN: [[LABEL]]: ; GCN-NEXT: s_endpgm -define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { +define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 %1 = sext i1 %0 to i32 @@ -25,7 +25,7 @@ ENDIF: ; preds = %IF, %main_body ; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]: ; GCN: s_cbranch_scc1 [[LOOP]] ; GCN: {{^}}[[BB0]]: -define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { +define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1) { bb: %cnd = trunc i32 %arg to i1 br i1 %cnd, label %bb2, label %bb5 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll index 2c3a0981886..e0067f9f45a 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -38,7 +38,7 @@ out: ; CHECK-NEXT: s_xor_b64 ; CHECK-NEXT: ; mask branch ; CHECK-NEXT: s_cbranch_execz -define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) { main_body: %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %cc = icmp eq i32 %tid, 0 diff --git a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll index 941f4c601e3..25a700a943d 100644 --- a/llvm/test/CodeGen/AMDGPU/unknown-processor.ll +++ b/llvm/test/CodeGen/AMDGPU/unknown-processor.ll @@ -13,7 +13,7 @@ ; GCN: ScratchSize: 8{{$}} ; R600: MOV -define void @foo() { +define amdgpu_kernel void @foo() { %alloca = alloca i32, align 4 store volatile i32 0, i32* %alloca ret void diff --git a/llvm/test/CodeGen/AMDGPU/unroll.ll b/llvm/test/CodeGen/AMDGPU/unroll.ll index 411a15a4b83..e171944c83b 100644 --- a/llvm/test/CodeGen/AMDGPU/unroll.ll +++ b/llvm/test/CodeGen/AMDGPU/unroll.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: @test ; CHECK-NOT: 
alloca ; CHECK: store i32 5, i32 addrspace(1)* %out -define void @test(i32 addrspace(1)* %out) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out) { entry: %0 = alloca [32 x i32] br label %loop.header diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll index d120111a71f..68e91e8c9c6 100644 --- a/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/unsupported-cc.ll @@ -6,7 +6,7 @@ ; CHECK: LSHR ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) -define void @slt(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp slt i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -18,7 +18,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 5(7.006492e-45) -define void @ult_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp ult i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -31,7 +31,7 @@ entry: ; CHECK-NEXT: 1084227584(5.000000e+00) ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 ; CHECK-NEXT: LSHR * -define void @ult_float(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -43,7 +43,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ult_float_native(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ult float %in, 5.0 %1 = select i1 %0, float 0.0, float 1.0 @@ -55,7 +55,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @olt(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -67,7 +67,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) -define void @sle(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp sle i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -79,7 +79,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT: 6(8.407791e-45) -define void @ule_i32(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) { entry: %0 = icmp ule i32 %in, 5 %1 = select i1 %0, i32 -1, i32 0 @@ -92,7 +92,7 @@ entry: ; CHECK-NEXT: 1084227584(5.000000e+00) ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0 ; CHECK-NEXT: LSHR * -define void @ule_float(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 @@ -104,7 +104,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGT {{\*? 
*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}} ; CHECK-NEXT: 1084227584(5.000000e+00) -define void @ule_float_native(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ule float %in, 5.0 %1 = select i1 %0, float 0.0, float 1.0 @@ -116,7 +116,7 @@ entry: ; CHECK: LSHR ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z ; CHECK-NEXT:1084227584(5.000000e+00) -define void @ole(float addrspace(1)* %out, float %in) { +define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) { entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.0, float 0.0 diff --git a/llvm/test/CodeGen/AMDGPU/urem.ll b/llvm/test/CodeGen/AMDGPU/urem.ll index 9e2cfa34e0b..fd7f8fa2efa 100644 --- a/llvm/test/CodeGen/AMDGPU/urem.ll +++ b/llvm/test/CodeGen/AMDGPU/urem.ll @@ -9,7 +9,7 @@ ; FUNC-LABEL: {{^}}test_urem_i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1 %a = load i32, i32 addrspace(1)* %in %b = load i32, i32 addrspace(1)* %b_ptr @@ -26,7 +26,7 @@ define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; SI: v_sub_i32 ; SI: buffer_store_dword ; SI: s_endpgm -define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %num = load i32, i32 addrspace(1) * %in %result = urem i32 %num, 7 store i32 %result, i32 addrspace(1)* %out @@ -36,7 +36,7 @@ define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}test_urem_v2i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1 %a = load <2 x i32>, <2 x i32> addrspace(1)* %in %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr @@ -48,7 +48,7 @@ define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1 ; FUNC-LABEL: {{^}}test_urem_v4i32: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1 %a = load <4 x i32>, <4 x i32> addrspace(1)* %in %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr @@ -60,7 +60,7 @@ define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1 ; FUNC-LABEL: {{^}}test_urem_i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1 %a = load i64, i64 addrspace(1)* %in %b = load i64, i64 addrspace(1)* %b_ptr @@ -72,7 +72,7 @@ define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) { ; FUNC-LABEL: {{^}}test_urem_v2i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* 
%in) { %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1 %a = load <2 x i64>, <2 x i64> addrspace(1)* %in %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr @@ -84,7 +84,7 @@ define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1 ; FUNC-LABEL: {{^}}test_urem_v4i64: ; SI: s_endpgm ; EG: CF_END -define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { +define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) { %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1 %a = load <4 x i64>, <4 x i64> addrspace(1)* %in %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll index 82bdc261b11..f8e6b7edfe3 100644 --- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll +++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll @@ -11,7 +11,7 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1 ; GCN: s_load_dword [[SGPR:s[0-9]+]], ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { %dbl = fadd float %a, %a store float %dbl, float addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 { ; GCN: s_load_dword [[SGPR:s[0-9]+]], ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -35,7 +35,7 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -58,7 +58,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]] ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] -define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 { %va0 = load volatile float, float addrspace(1)* %in %va1 = load volatile float, float addrspace(1)* %in %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1 @@ -76,7 +76,7 @@ define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] -define void 
@test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -90,7 +90,7 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 { %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -100,7 +100,7 @@ define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, floa ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0 ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -110,7 +110,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -121,7 +121,7 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 { %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1 store float %val, float addrspace(1)* %out, align 4 ret void @@ -132,7 +132,7 @@ define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, fl ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1 store float %fma, float addrspace(1)* %out, align 4 ret void @@ -143,7 +143,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, f ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]] ; GCN: buffer_store_dword [[RESULT0]] -define void @test_literal_use_twice_ternary_op_k_k_s(float 
addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 store float %fma, float addrspace(1)* %out ret void @@ -158,7 +158,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, f ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1 %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1 store volatile float %fma0, float addrspace(1)* %out @@ -171,7 +171,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 store float %fma, float addrspace(1)* %out ret void @@ -186,7 +186,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, f ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -199,7 +199,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]] ; GCN: buffer_store_dword [[RESULT]] -define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 { %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 store float %fma, float addrspace(1)* %out ret void @@ -214,7 +214,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, f ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] ; GCN: s_endpgm -define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { +define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -234,7 +234,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out ; GCN: buffer_store_dword [[RESULT0]] ; GCN: buffer_store_dword [[RESULT1]] -define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float 
%b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1 %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1 store volatile float %fma0, float addrspace(1)* %out @@ -259,7 +259,7 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 { ; GCN: buffer_store_dwordx2 [[RESULT0]] ; GCN: buffer_store_dwordx2 [[RESULT1]] -define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { +define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 { %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1 %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1 store volatile double %fma0, double addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 9391eda00e7..d1f454f0bc6 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -9,7 +9,7 @@ ; EG: SUBB_UINT ; EG: ADDC_UINT -define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 %val = extractvalue { i64, i1 } %usub, 0 %carry = extractvalue { i64, i1 } %usub, 1 @@ -27,7 +27,7 @@ define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 { %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b) %val = extractvalue { i32, i1 } %usub, 0 %carry = extractvalue { i32, i1 } %usub, 1 @@ -42,7 +42,7 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -63,7 +63,7 @@ define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 ; EG-DAG: SUBB_UINT ; EG-DAG: SUB_INT -define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr @@ -87,7 +87,7 @@ define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryou ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT ; EG: SUB_INT -define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { +define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 { %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) %val = extractvalue { i64, i1 } %usub, 0 %carry = 
extractvalue { i64, i1 } %usub, 1 @@ -104,7 +104,7 @@ define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; EG-DAG: SUB_INT ; EG-DAG: SUB_INT ; EG: SUB_INT -define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr @@ -122,7 +122,7 @@ define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 ; FUNC-LABEL: {{^}}v_usubo_i16: ; VI: v_subrev_u16_e32 ; VI: v_cmp_gt_u16_e32 -define void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { +define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr diff --git a/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll index a48e7acd4cf..b7d766aa395 100644 --- a/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll +++ b/llvm/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll @@ -1,14 +1,14 @@ ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s ; CHECK-LABEL: {{^}}kernel_arg_i64: -define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { +define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind { store i64 %a, i64 addrspace(1)* %out, align 8 ret void } ; i64 arg works, v1i64 arg does not. 
; CHECK-LABEL: {{^}}kernel_arg_v1i64: -define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { +define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind { store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8 ret void } diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index 78d534b9085..d4a68a418ee 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff ; GCN: s_endpgm -define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { +define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 { %idx = call i32 @llvm.amdgcn.workitem.id.x() #1 %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx %f = load float, float addrspace(1)* %f.gep @@ -30,7 +30,7 @@ define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace( ; GCN-DAG: v{{[0-9]}} ; All nan values are converted to 0xffffffff ; GCN: s_endpgm -define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { +define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { %setcc = icmp ne i32 %c, 0 %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f store float %select, float addrspace(1)* %out @@ -47,7 +47,7 @@ define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 { ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -62,7 +62,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -77,7 +77,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc -define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -92,7 +92,7 @@ define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], 
vcc -define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext @@ -107,7 +107,7 @@ define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float % ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]] -define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext @@ -124,7 +124,7 @@ define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: s_load_dword [[X:s[0-9]+]] ; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0 ; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]] -define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext @@ -142,7 +142,7 @@ define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float % ; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc -define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -159,7 +159,7 @@ define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float a ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc -define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -178,7 +178,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float a ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]] ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]] ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc -define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, 
i32 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -203,7 +203,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrs ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}} ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s -define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext @@ -226,7 +226,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrs ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -249,7 +249,7 @@ define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -275,7 +275,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc -define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -298,7 +298,7 @@ define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s ; GCN: store_byte -define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 
addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -321,7 +321,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspa ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -343,7 +343,7 @@ define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, flo ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc -define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext @@ -364,7 +364,7 @@ define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]] ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc -define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext @@ -386,7 +386,7 @@ define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]] ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc -define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { +define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext diff --git a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll index 9246ce38dbe..2cda52a8438 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll @@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.cvt.pk.u8.f32(float, i32, i32) #0 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}} 
-define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -13,7 +13,7 @@ define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -21,7 +21,7 @@ define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -29,7 +29,7 @@ define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void @@ -40,7 +40,7 @@ define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}} ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}} ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) { %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0 %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0 %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0 @@ -51,7 +51,7 @@ define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %re ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx: ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) { +define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) { %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0 store i32 %result, i32 addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/v_mac.ll b/llvm/test/CodeGen/AMDGPU/v_mac.ll index 290753c4acb..2b96f7d5007 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -8,7 +8,7 @@ ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]] ; GCN: buffer_store_dword [[C]] -define void 
@mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -26,7 +26,7 @@ entry: ; GCN-LABEL: {{^}}mad_inline_sgpr_inline: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5 -define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 { +define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 { entry: %tmp0 = fmul float 0.5, %in %tmp1 = fadd float %tmp0, 0.5 @@ -37,7 +37,7 @@ entry: ; GCN-LABEL: {{^}}mad_vvs: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} -define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 { +define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 @@ -52,7 +52,7 @@ entry: ; GCN-LABEL: {{^}}mac_ssv: ; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}} -define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 { +define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 { entry: %c = load float, float addrspace(1)* %in @@ -65,7 +65,7 @@ entry: ; GCN-LABEL: {{^}}mac_mad_same_add: ; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} -define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -96,7 +96,7 @@ entry: ; GCN-LABEL: {{^}}mad_neg_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -116,7 +116,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src0: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -136,7 +136,7 @@ entry: ; GCN-LABEL: {{^}}safe_mad_sub0_src0: ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0, ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]] -define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -156,7 +156,7 @@ entry: ; GCN-LABEL: {{^}}mad_neg_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) 
#0 { +define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -176,7 +176,7 @@ entry: ; GCN-LABEL: {{^}}nsz_mad_sub0_src1: ; GCN-NOT: v_mac_f32 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} -define void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 { +define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -196,7 +196,7 @@ entry: ; GCN-LABEL: {{^}}mad_neg_src2: ; GCN-NOT: v_mac ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} -define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 { entry: %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1 %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2 @@ -222,7 +222,7 @@ entry: ; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { +define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -257,7 +257,7 @@ bb: ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]] ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0 -define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { +define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 { bb: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll index f5304bd5f10..6a112842815 100644 --- a/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -14,7 +14,7 @@ ; VI: v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]] ; VI: buffer_store_short v[[C_F16]] ; GCN: s_endpgm -define void @mac_f16( +define amdgpu_kernel void @mac_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -38,7 +38,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]] ; VI: v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_same_add( +define amdgpu_kernel void @mac_f16_same_add( half addrspace(1)* %r0, half addrspace(1)* %r1, half addrspace(1)* %a, @@ -73,7 +73,7 @@ entry: ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_a( +define amdgpu_kernel void @mac_f16_neg_a( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -100,7 +100,7 @@ entry: ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_b( +define amdgpu_kernel void @mac_f16_neg_b( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -127,7 +127,7 @@ entry: ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 
-v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_c( +define amdgpu_kernel void @mac_f16_neg_c( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -151,7 +151,7 @@ entry: ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]] ; GCN: s_endpgm -define void @mac_f16_neg_a_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -175,7 +175,7 @@ entry: ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_b_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -199,7 +199,7 @@ entry: ; VI: v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_f16_neg_c_safe_fp_math( +define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -226,7 +226,7 @@ entry: ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_a_nsz_fp_math( +define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -253,7 +253,7 @@ entry: ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_b_nsz_fp_math( +define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -280,7 +280,7 @@ entry: ; VI-NOT: v_mac_f16 ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}} ; GCN: s_endpgm -define void @mac_f16_neg_c_nsz_fp_math( +define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b, @@ -328,7 +328,7 @@ entry: ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]] ; GCN: s_endpgm -define void @mac_v2f16( +define amdgpu_kernel void @mac_v2f16( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -355,7 +355,7 @@ entry: ; VI: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_same_add( +define amdgpu_kernel void @mac_v2f16_same_add( <2 x half> addrspace(1)* %r0, <2 x half> addrspace(1)* %r1, <2 x half> addrspace(1)* %a, @@ -392,7 +392,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_a( +define amdgpu_kernel void @mac_v2f16_neg_a( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -421,7 +421,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b( +define amdgpu_kernel void @mac_v2f16_neg_b( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -454,7 +454,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c( 
+define amdgpu_kernel void @mac_v2f16_neg_c( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -482,7 +482,7 @@ entry: ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]] ; GCN: s_endpgm -define void @mac_v2f16_neg_a_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -510,7 +510,7 @@ entry: ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -538,7 +538,7 @@ entry: ; VI: v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c_safe_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -571,7 +571,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_a_nsz_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -604,7 +604,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_b_nsz_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, @@ -637,7 +637,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}} ; GCN: s_endpgm -define void @mac_v2f16_neg_c_nsz_fp_math( +define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math( <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, <2 x half> addrspace(1)* %b, diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index df87a94ca07..bfb10503aae 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -7,7 +7,7 @@ ; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} ; VI: buffer_store_short v[[R_F16]] ; GCN: s_endpgm -define void @madak_f16( +define amdgpu_kernel void @madak_f16( half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) { @@ -28,7 +28,7 @@ entry: ; VI: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GCN: s_endpgm -define void @madak_f16_use_2( +define amdgpu_kernel void @madak_f16_use_2( half addrspace(1)* %r0, half addrspace(1)* %r1, half addrspace(1)* %a, diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index e25589c29cf..41220ff8f87 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -29,7 +29,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]] ; SI-NEXT: ; mask branch ; -define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { +define amdgpu_kernel 
void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone switch i32 %tid, label %default [ @@ -76,7 +76,7 @@ end: ; SI: BB1_2: ; SI: s_or_b64 exec, exec, [[BR_SREG]] ; SI: s_endpgm -define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 br i1 %is.0, label %store, label %exit @@ -106,7 +106,7 @@ exit: ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm -define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { +define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 @@ -173,7 +173,7 @@ exit: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm -define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { +define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp4 = sext i32 %tmp to i64 diff --git a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 03e473e3a0c..5e5465800c3 100644 --- a/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/llvm/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -1,7 +1,7 @@ # RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s --- | - define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 { entry: %cmp0 = fcmp oeq float %cond, 0.000000e+00 br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 @@ -20,7 +20,7 @@ ret void } - define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { + define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 { entry: br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll index 7dcf36f144a..03cf725601b 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca.ll @@ -15,7 +15,7 @@ ; EG: MOV ; EG: MOV ; EG: MOVA_INT -define void @vector_read(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -44,7 +44,7 @@ entry: ; EG: MOV ; EG: MOVA_INT ; EG: MOVA_INT -define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { +define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -71,7 +71,7 @@ entry: ; FUNC-LABEL: {{^}}bitcast_gep: ; EG: STORE_RAW -define void @bitcast_gep(i32 addrspace(1)* %out, i32 
%w_index, i32 %r_index) { +define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) { entry: %tmp = alloca [4 x i32] %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -93,7 +93,7 @@ entry: ; OPT-LABEL: @vector_read_bitcast_gep( ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 -define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 @@ -121,7 +121,7 @@ entry: ; OPT: store float ; OPT: store float ; OPT: load float -define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { +define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) { entry: %tmp = alloca [4 x i32] %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]* diff --git a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll index 2d39f82e249..ab2bfcfd1fb 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-extract-insert.ll @@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -30,7 +30,7 @@ define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> ; GCN: v_movreld_b32 ; GCN: v_movrels_b32 ; GCN: buffer_store_dword v -define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { +define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -49,7 +49,7 @@ define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext @@ -68,7 +68,7 @@ define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> ad ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] ; GCN-NOT: [[VVAL]] ; GCN: buffer_store_dword [[VVAL]] -define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 { +define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> 
addrspace(1)* %in, float %val, i32 %idx) #1 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64 %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll index 7c254abc642..90cf34e609f 100644 --- a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll +++ b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -12,7 +12,7 @@ ; CHECK-DAG: ds_write2_b32 ; CHECK-DAG: ds_write2_b32 -define void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) { +define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) { bb: %tmp = load i32, i32 addrspace(1)* %arg, align 4 store i32 %tmp, i32 addrspace(3)* %arg1, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll index 3d71062f1fb..46a1c87184d 100644 --- a/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll +++ b/llvm/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll @@ -6,7 +6,7 @@ ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { %v = load i32, i32 addrspace(1)* %in store i32 %v, i32 addrspace(1)* %out ret void @@ -16,7 +16,7 @@ define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { ; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00 ; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00 -define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { +define amdgpu_kernel void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { %v = load <4 x i32>, <4 x i32> addrspace(1)* %in store <4 x i32> %v, <4 x i32> addrspace(1)* %out ret void @@ -26,7 +26,7 @@ define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00 -define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { +define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { %v = load i32, i32 addrspace(7)* %in store i32 %v, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) { @t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3] -define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) { %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in %v = load i32, i32 addrspace(2)* %a store i32 %v, i32 addrspace(1)* %out diff --git 
a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll index f04cbcb4605..e82e548f23c 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll @@ -45,7 +45,7 @@ ; GCN: ScratchSize: 1536 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset. -define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { +define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 { bb: %tmp = add i32 %arg1, %arg2 %tmp7 = extractelement <4 x float> %arg6, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll index 7825e36352f..8d66c346ed5 100644 --- a/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll @@ -4,7 +4,7 @@ declare float @llvm.amdgcn.rsq.legacy(float) #0 -define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { +define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4 store float %rsq, float addrspace(1)* %out, align 4 ret void diff --git a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll index ae8ec58270c..d2708b068eb 100644 --- a/llvm/test/CodeGen/AMDGPU/vop-shrink.ll +++ b/llvm/test/CodeGen/AMDGPU/vop-shrink.ll @@ -8,7 +8,7 @@ ; ModuleID = 'vop-shrink.ll' -define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { +define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) { entry: %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1 %tmp = icmp eq i32 %cond, 0 @@ -35,7 +35,7 @@ endif: ; preds = %else, %if ; FUNC-LABEL: {{^}}add_fold: ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000 -define void @add_fold(float addrspace(1)* %out) { +define amdgpu_kernel void @add_fold(float addrspace(1)* %out) { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = uitofp i32 %tmp to float diff --git a/llvm/test/CodeGen/AMDGPU/vselect.ll b/llvm/test/CodeGen/AMDGPU/vselect.ll index fe5be7526b1..bb6234729f9 100644 --- a/llvm/test/CodeGen/AMDGPU/vselect.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect.ll @@ -10,7 +10,7 @@ ; SI: v_cndmask_b32_e64 ; SI: v_cndmask_b32_e32 -define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { +define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) { entry: %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0 %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1 @@ -28,7 +28,7 @@ entry: ;SI: v_cndmask_b32_e64 ;SI: v_cndmask_b32_e32 -define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { +define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) { entry: %0 = load <2 x float>, <2 x float> addrspace(1)* %in0 %1 = load <2 x float>, <2 x float> addrspace(1)* %in1 @@ -52,7 +52,7 @@ entry: ; SI: v_cndmask_b32 ; SI: v_cndmask_b32 -define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> 
addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { +define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) { entry: %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0 %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1 @@ -68,7 +68,7 @@ entry: ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { +define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) { entry: %0 = load <4 x float>, <4 x float> addrspace(1)* %in0 %1 = load <4 x float>, <4 x float> addrspace(1)* %in1 diff --git a/llvm/test/CodeGen/AMDGPU/vselect64.ll b/llvm/test/CodeGen/AMDGPU/vselect64.ll index ef85ebe7899..4a043556516 100644 --- a/llvm/test/CodeGen/AMDGPU/vselect64.ll +++ b/llvm/test/CodeGen/AMDGPU/vselect64.ll @@ -5,7 +5,7 @@ ; Make sure the vectors aren't being stored on the stack. We know they are ; being stored on the stack if the shaders uses at leat 10 registers. ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X -define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { +define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) { entry: %cmp = icmp ne <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0> %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7> diff --git a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll index 4584d6e2525..4c5eb3d3aa5 100644 --- a/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll +++ b/llvm/test/CodeGen/AMDGPU/vtx-fetch-branch.ll @@ -10,7 +10,7 @@ ; CHECK-NOT: ALU_POP_AFTER ; CHECK: TEX ; CHECK-NEXT: POP -define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) { entry: %0 = icmp eq i32 %cond, 0 br i1 %0, label %endif, label %if diff --git a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll index 912e258ebb8..c4b619bf168 100644 --- a/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll +++ b/llvm/test/CodeGen/AMDGPU/vtx-schedule.ll @@ -9,7 +9,7 @@ ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0 ; CHECK: Fetch clause ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0 -define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { +define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) { entry: %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0 %1 = load i32, i32 addrspace(1)* %0 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll index d29bae45d8c..5d86b12da95 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-flat.ll @@ -9,7 +9,7 @@ ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]] ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}] -define void @test(i32 addrspace(1)* %out, i32 %in) { +define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) { store volatile i32 0, i32 addrspace(1)* %out %val = load volatile i32, i32 
addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir index afe651af093..38662e83b35 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir @@ -1,18 +1,18 @@ # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits %s -o - | FileCheck %s --- | - define void @flat_zero_waitcnt(i32 addrspace(1)* %global4, + define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4, <4 x i32> addrspace(1)* %global16, i32 addrspace(4)* %flat4, <4 x i32> addrspace(4)* %flat16) { ret void } - define void @single_fallthrough_successor_no_end_block_wait() { + define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() { ret void } - define void @single_branch_successor_not_next_block() { + define amdgpu_kernel void @single_branch_successor_not_next_block() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll index deac809f9b0..b1ee016e99c 100644 --- a/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll @@ -10,7 +10,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0 declare i32 @llvm.amdgcn.workitem.id.x() #0 -define void @write_vgpr_into_sgpr() { +define amdgpu_kernel void @write_vgpr_into_sgpr() { %tid = call i32 @llvm.amdgcn.workitem.id.x() call void @llvm.write_register.i32(metadata !0, i32 %tid) ret void diff --git a/llvm/test/CodeGen/AMDGPU/write_register.ll b/llvm/test/CodeGen/AMDGPU/write_register.ll index 88660ba6ec6..9c62e003dde 100644 --- a/llvm/test/CodeGen/AMDGPU/write_register.ll +++ b/llvm/test/CodeGen/AMDGPU/write_register.ll @@ -4,7 +4,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0 declare void @llvm.write_register.i64(metadata, i64) #0 ; CHECK-LABEL: {{^}}test_write_m0: -define void @test_write_m0(i32 %val) #0 { +define amdgpu_kernel void @test_write_m0(i32 %val) #0 { call void @llvm.write_register.i32(metadata !0, i32 0) call void @llvm.write_register.i32(metadata !0, i32 -1) call void @llvm.write_register.i32(metadata !0, i32 %val) @@ -15,7 +15,7 @@ define void @test_write_m0(i32 %val) #0 { ; CHECK: s_mov_b64 exec, 0 ; CHECK: s_mov_b64 exec, -1 ; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}} -define void @test_write_exec(i64 %val) #0 { +define amdgpu_kernel void @test_write_exec(i64 %val) #0 { call void @llvm.write_register.i64(metadata !1, i64 0) call void @llvm.write_register.i64(metadata !1, i64 -1) call void @llvm.write_register.i64(metadata !1, i64 %val) @@ -26,7 +26,7 @@ define void @test_write_exec(i64 %val) #0 { ; CHECK: s_mov_b64 flat_scratch, 0 ; CHECK: s_mov_b64 flat_scratch, -1 ; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}} -define void @test_write_flat_scratch(i64 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 { call void @llvm.write_register.i64(metadata !2, i64 0) call void @llvm.write_register.i64(metadata !2, i64 -1) call void @llvm.write_register.i64(metadata !2, i64 %val) @@ -36,7 +36,7 @@ define void @test_write_flat_scratch(i64 %val) #0 { ; CHECK-LABEL: {{^}}test_write_flat_scratch_lo: ; CHECK: s_mov_b32 flat_scratch_lo, 0 ; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}} -define void @test_write_flat_scratch_lo(i32 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 { call void @llvm.write_register.i32(metadata !3, i32 0) call void @llvm.write_register.i32(metadata !3, i32 %val) ret void @@ -45,7 
+45,7 @@ define void @test_write_flat_scratch_lo(i32 %val) #0 { ; CHECK-LABEL: {{^}}test_write_flat_scratch_hi: ; CHECK: s_mov_b32 flat_scratch_hi, 0 ; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}} -define void @test_write_flat_scratch_hi(i32 %val) #0 { +define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 { call void @llvm.write_register.i32(metadata !4, i32 0) call void @llvm.write_register.i32(metadata !4, i32 %val) ret void @@ -54,7 +54,7 @@ define void @test_write_flat_scratch_hi(i32 %val) #0 { ; CHECK-LABEL: {{^}}test_write_exec_lo: ; CHECK: s_mov_b32 exec_lo, 0 ; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}} -define void @test_write_exec_lo(i32 %val) #0 { +define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 { call void @llvm.write_register.i32(metadata !5, i32 0) call void @llvm.write_register.i32(metadata !5, i32 %val) ret void @@ -63,7 +63,7 @@ define void @test_write_exec_lo(i32 %val) #0 { ; CHECK-LABEL: {{^}}test_write_exec_hi: ; CHECK: s_mov_b32 exec_hi, 0 ; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}} -define void @test_write_exec_hi(i32 %val) #0 { +define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 { call void @llvm.write_register.i32(metadata !6, i32 0) call void @llvm.write_register.i32(metadata !6, i32 %val) ret void diff --git a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll index 7f6b8045904..36532365d87 100644 --- a/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll +++ b/llvm/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll @@ -4,7 +4,7 @@ ;CHECK: {{^}}fill3d: ;CHECK-NOT: MULLO_INT T[0-9]+ -define void @fill3d(i32 addrspace(1)* nocapture %out) #0 { +define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 { entry: %x.i = tail call i32 @llvm.r600.read.global.size.x() #1 %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1 diff --git a/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll b/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll index babae9ead27..88ef9fd93c8 100644 --- a/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll @@ -5,7 +5,7 @@ ; TODO: enable doubles ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32: -define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { +define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) { %val = load double, double addrspace(1)* %in, align 8 %add = fadd double %val, 4.0 %bc = bitcast double %add to <2 x i32> @@ -14,7 +14,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace } ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64: -define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { +define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end @@ -30,7 +30,7 @@ end: } ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64: -define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { +define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) { entry: %cmp0 = icmp eq i32 %cond, 0 br i1 %cmp0, label %if, label %end diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index bf02d4c3b31..57a082a0170 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -10,7 +10,7 @@ ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, 
v[0-9]+}} ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}} -define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { +define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) { %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0 %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1 %result = xor <2 x i32> %a, %b @@ -29,7 +29,7 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}} -define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0 %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1 %result = xor <4 x i32> %a, %b @@ -46,7 +46,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm -define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { +define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) { %a = load float, float addrspace(1) * %in0 %b = load float, float addrspace(1) * %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -63,7 +63,7 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]] ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]] ; SI: buffer_store_byte [[RESULT]] -define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { +define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) { %a = load volatile i1, i1 addrspace(1)* %in0 %b = load volatile i1, i1 addrspace(1)* %in1 %xor = xor i1 %a, %b @@ -73,7 +73,7 @@ define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace ; FUNC-LABEL: {{^}}vector_xor_i32: ; SI: v_xor_b32_e32 -define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %a = load i32, i32 addrspace(1)* %in0 %b = load i32, i32 addrspace(1)* %in1 %result = xor i32 %a, %b @@ -83,7 +83,7 @@ define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 ; FUNC-LABEL: {{^}}scalar_xor_i32: ; SI: s_xor_b32 -define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { %result = xor i32 %a, %b store i32 %result, i32 addrspace(1)* %out ret void @@ -91,7 +91,7 @@ define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { ; FUNC-LABEL: {{^}}scalar_not_i32: ; SI: s_not_b32 -define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { +define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { %result = xor i32 %a, -1 store i32 %result, i32 addrspace(1)* %out ret void @@ -99,7 +99,7 @@ define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) { ; FUNC-LABEL: {{^}}vector_not_i32: ; SI: v_not_b32 -define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 
addrspace(1)* %in1) { +define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %a = load i32, i32 addrspace(1)* %in0 %b = load i32, i32 addrspace(1)* %in1 %result = xor i32 %a, -1 @@ -111,7 +111,7 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 ; SI: v_xor_b32_e32 ; SI: v_xor_b32_e32 ; SI: s_endpgm -define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { %a = load i64, i64 addrspace(1)* %in0 %b = load i64, i64 addrspace(1)* %in1 %result = xor i64 %a, %b @@ -122,7 +122,7 @@ define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 ; FUNC-LABEL: {{^}}scalar_xor_i64: ; SI: s_xor_b64 ; SI: s_endpgm -define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %result = xor i64 %a, %b store i64 %result, i64 addrspace(1)* %out ret void @@ -130,7 +130,7 @@ define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { ; FUNC-LABEL: {{^}}scalar_not_i64: ; SI: s_not_b64 -define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { %result = xor i64 %a, -1 store i64 %result, i64 addrspace(1)* %out ret void @@ -139,7 +139,7 @@ define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}vector_not_i64: ; SI: v_not_b32 ; SI: v_not_b32 -define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { +define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) { %a = load i64, i64 addrspace(1)* %in0 %b = load i64, i64 addrspace(1)* %in1 %result = xor i64 %a, -1 @@ -153,7 +153,7 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 ; FUNC-LABEL: {{^}}xor_cf: ; SI: s_xor_b64 -define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { +define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) { entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -178,7 +178,7 @@ endif: ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]] ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]] -define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out ret void @@ -192,7 +192,7 @@ define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]] ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]] -define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { %or = xor i64 %a, 4261135838621753 store i64 %or, i64 addrspace(1)* %out @@ -211,7 +211,7 @@ define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i6 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]] ; SI-NOT: xor_b32 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}} -define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void 
@scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, 63 store i64 %or, i64 addrspace(1)* %out ret void @@ -220,7 +220,7 @@ define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64: ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}} ; SI: s_xor_b64 [[VAL]], [[VAL]], -8 -define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { +define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { %or = xor i64 %a, -8 store i64 %or, i64 addrspace(1)* %out ret void @@ -231,7 +231,7 @@ define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) { ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]] ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}} ; SI: s_endpgm -define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = xor i64 %loada, -8 store i64 %or, i64 addrspace(1)* %out @@ -243,7 +243,7 @@ define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]] ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]] ; SI: s_endpgm -define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { +define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) { %loada = load i64, i64 addrspace(1)* %a, align 8 %or = xor i64 %loada, 22470723082367 store i64 %or, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index 57209961760..f256d89f0cb 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -9,7 +9,7 @@ ; SI: {{^}}s_mad_zext_i32_to_i64: ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}} ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}} -define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { +define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 { entry: %tmp0 = mul i32 %a, %b %tmp1 = add i32 %tmp0, %c @@ -20,7 +20,7 @@ entry: ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { entry: %tmp0 = icmp eq i32 %a, %b %tmp1 = zext i1 %tmp0 to i32 @@ -29,7 +29,7 @@ entry: } ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64: -define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { +define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { %ext = zext i1 %arg to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 ret void @@ -39,7 +39,7 @@ define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 { ; SI: s_mov_b32 s{{[0-9]+}}, 0 ; SI: v_cmp_eq_u32 ; SI: v_cndmask_b32 -define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 store i64 %ext, i64 addrspace(1)* %out, align 8 @@ -49,7 +49,7 @@ define void @s_cmp_zext_i1_to_i64(i64 
addrspace(1)* %out, i32 %a, i32 %b) #0 { ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc ; SI: buffer_store_short [[RESULT]] -define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { +define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %tmp0 = icmp eq i16 %a, %b %tmp1 = zext i1 %tmp0 to i16 store i16 %tmp1, i16 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll index 842c30b40df..a902234898c 100644 --- a/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll +++ b/llvm/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll @@ -11,7 +11,7 @@ ; GCN-NOT: v[[HI]] ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 %ext = zext i32 %ld.32 to i64 @@ -31,7 +31,7 @@ define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, ; GCN-NOT: _or_ ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { +define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) { %ld.64 = load volatile i64, i64 addrspace(1)* %in0 %ld.32 = load volatile i32, i32 addrspace(1)* %in1 %ext = zext i32 %ld.32 to i64 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir index 84d3baa4c9a..5da98fb9c2d 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir @@ -6,7 +6,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4 - define void @float(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 diff --git a/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir index 3277d37d7e4..7cef01c9d12 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir @@ -1,6 +1,6 @@ # RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s --- | - define void @add_f32_1.0_one_f16_use() #0 { + define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -11,7 +11,7 @@ ret void } - define void @add_f32_1.0_multi_f16_use() #0 { + define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* 
undef %f32.val = load volatile float, float addrspace(1)* undef @@ -22,7 +22,7 @@ ret void } - define void @add_f32_1.0_one_f32_use_one_f16_use () #0 { + define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -33,7 +33,7 @@ ret void } - define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 { + define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -46,7 +46,7 @@ ret void } - define void @add_i32_1_multi_f16_use() #0 { + define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f16.add0 = fadd half %f16.val0, 0xH0001 @@ -56,7 +56,7 @@ ret void } - define void @add_i32_m2_one_f32_use_multi_f16_use () #0 { + define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -69,7 +69,7 @@ ret void } - define void @add_f16_1.0_multi_f32_use() #0 { + define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 { %f32.val0 = load volatile float, float addrspace(1)* undef %f32.val1 = load volatile float, float addrspace(1)* undef %f32.val = load volatile float, float addrspace(1)* undef @@ -80,7 +80,7 @@ ret void } - define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 { + define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile half, half addrspace(1)* undef @@ -91,7 +91,7 @@ ret void } - define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 { + define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 { %f16.val0 = load volatile half, half addrspace(1)* undef %f16.val1 = load volatile half, half addrspace(1)* undef %f32.val = load volatile half, half addrspace(1)* undef diff --git a/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir b/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir index f43266eacbf..68950a4b251 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/intrinsics.mir @@ -2,7 +2,7 @@ --- | - define void @use_intrin() { + define amdgpu_kernel void @use_intrin() { ret void } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir index fec7a5d7a38..8cffc86373a 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir @@ -6,7 +6,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4 - define void @float(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 diff --git 
a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir index 10c8128b3ce..32669de15ea 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/target-index-operands.mir @@ -7,7 +7,7 @@ @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4 - define void @float(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 @@ -15,7 +15,7 @@ ret void } - define void @float2(float addrspace(1)* %out, i32 %index) #0 { + define amdgpu_kernel void @float2(float addrspace(1)* %out, i32 %index) #0 { entry: %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index %1 = load float, float addrspace(2)* %0 diff --git a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll index 6cec253bbf9..2bcb3a9d1e3 100644 --- a/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll +++ b/llvm/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll @@ -5,7 +5,7 @@ ; ASC-NOT: ptrtoint ; ASC-NOT: inttoptr -define void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 { +define amdgpu_kernel void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 { bb: %tmp = getelementptr inbounds float, float addrspace(3)* %arg2, i32 16 %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll b/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll index aa4fb8e68eb..36c7bd9c5ec 100644 --- a/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll +++ b/llvm/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll @@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK-LABEL: @indvar_32_bit( ; CHECK-NOT: sext i32 ; CHECK: phi i32 -define void @indvar_32_bit(i32 %n, i32* nocapture %output) { +define amdgpu_kernel void @indvar_32_bit(i32 %n, i32* nocapture %output) { entry: %cmp5 = icmp sgt i32 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.end @@ -46,7 +46,7 @@ for.end: ; preds = %for.end.loopexit, % ; CHECK-NOT: ashr i64 ; CHECK-NOT: mul nsw i64 ; CHECK-NOT: add nsw i64 -define void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { +define amdgpu_kernel void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { entry: br label %for.body @@ -72,7 +72,7 @@ for.end: ; be legalized anyway. 
; CHECK-LABEL: @indvar_48_bit( -define void @indvar_48_bit(i48 %n, i48* nocapture %output) { +define amdgpu_kernel void @indvar_48_bit(i48 %n, i48* nocapture %output) { entry: %cmp5 = icmp sgt i48 %n, 0 br i1 %cmp5, label %for.body.preheader, label %for.end diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll index 67b4ccda1a1..b566c147e9b 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll @@ -45,7 +45,7 @@ define float @load_private_from_flat(float addrspace(4)* %generic_scalar) #0 { ; CHECK-LABEL: @store_global_from_flat( ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0 -define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 { +define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 { %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)* store float 0.0, float addrspace(1)* %tmp0 ret void @@ -54,7 +54,7 @@ define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 { ; CHECK-LABEL: @store_group_from_flat( ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* ; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0 -define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 { +define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 { %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)* store float 0.0, float addrspace(3)* %tmp0 ret void @@ -63,7 +63,7 @@ define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 { ; CHECK-LABEL: @store_private_from_flat( ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* ; CHECK-NEXT: store float 0.000000e+00, float* %tmp0 -define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 { +define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 { %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float* store float 0.0, float* %tmp0 ret void @@ -74,7 +74,7 @@ define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 { ; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4 ; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -87,7 +87,7 @@ define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace ; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4 ; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { +define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 
addrspace(3)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -100,7 +100,7 @@ define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace( ; CHECK-NEXT: %val = load i32, i32* %input, align 4 ; CHECK-NEXT: store i32 %val, i32* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 { +define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 { %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -113,7 +113,7 @@ define void @load_store_private(i32* nocapture %input, i32* nocapture %output) # ; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4 ; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4 ; CHECK-NEXT: ret void -define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 { +define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 { %val = load i32, i32 addrspace(4)* %input, align 4 store i32 %val, i32 addrspace(4)* %output, align 4 ret void @@ -122,7 +122,7 @@ define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4 ; CHECK-LABEL: @store_addrspacecast_ptr_value( ; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* ; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4 -define void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 { %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4 ret void diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll index aad9db63269..52067cd37bb 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll @@ -28,7 +28,7 @@ ; CHECK: store float %v, float addrspace(3)* %tmp7, align 4 ; CHECK: call void @llvm.amdgcn.s.barrier() ; CHECK: ret void -define void @load_store_lds_f32(i32 %i, float %v) #0 { +define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 { bb: %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4 call void @use(float %tmp) @@ -83,7 +83,7 @@ bb: ; CHECK-LABEL: @nested_const_expr( ; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4 -define void @nested_const_expr() #0 { +define amdgpu_kernel void @nested_const_expr() #0 { store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4 ret void } @@ -93,7 +93,7 @@ define void @nested_const_expr() #0 { ; CHECK-NEXT: %v = load float, float addrspace(1)* %addr ; CHECK-NEXT: store float %v, float addrspace(1)* %addr ; CHECK-NEXT: 
ret void -define void @rauw(float addrspace(1)* %input) #0 { +define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 { bb: %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)* %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10 @@ -117,7 +117,7 @@ bb: ; CHECK: %exit_cond = icmp eq float addrspace(3)* %i2, %end ; CHECK: br i1 %exit_cond, label %exit, label %loop -define void @loop() #0 { +define amdgpu_kernel void @loop() #0 { entry: %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* %end = getelementptr float, float addrspace(4)* %p, i64 10 @@ -150,7 +150,7 @@ exit: ; preds = %loop ; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)* ; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end ; CHECK: br i1 %exit_cond, label %exit, label %loop -define void @loop_with_generic_bound() #0 { +define amdgpu_kernel void @loop_with_generic_bound() #0 { entry: %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)* %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll index afd1493fc0e..557a80f1a5d 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: @memset_group_to_flat( ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { +define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -10,7 +10,7 @@ define void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { ; CHECK-LABEL: @memset_global_to_flat( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { +define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -18,7 +18,7 @@ define void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { ; CHECK-LABEL: @memset_group_to_flat_no_md( ; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} -define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) ret void @@ -26,7 +26,7 @@ define void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) ; CHECK-LABEL: @memset_global_to_flat_no_md( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}} -define void 
@memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 { +define amdgpu_kernel void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 { %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false) ret void @@ -34,7 +34,7 @@ define void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -42,7 +42,7 @@ define void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group( ; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 { %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -50,7 +50,7 @@ define void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.
; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group( ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %src.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* %cast.dest = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 @@ -59,7 +59,7 @@ define void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %d ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global( ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)* %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 @@ -68,7 +68,7 @@ define void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* ; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global( ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 { +define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 { %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void @@ -76,7 +76,7 @@ define void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.glo ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7 -define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), 
!tbaa.struct !7 ret void @@ -84,7 +84,7 @@ define void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} -define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) ret void @@ -93,7 +93,7 @@ define void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* % ; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md( ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} ; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}} -define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false) @@ -103,14 +103,14 @@ define void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrsp ; Check for iterator problems if the pointer has 2 uses in the same call ; CHECK-LABEL: @memcpy_group_flat_to_flat_self( ; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 { +define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void } ; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group( ; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 -define void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { +define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)* call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 ret void diff --git 
a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll index 17997052f07..3231b6ccf1c 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll @@ -9,7 +9,7 @@ ; CHECK-LABEL: @generic_address_bitcast_const( ; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8 -define void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 { +define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 { entry: %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = zext i32 %tmp1 to i64 @@ -39,7 +39,7 @@ declare i32 @_Z9get_fencePU3AS4v(i8 addrspace(4)*) ; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)* ; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2 ; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4 -define void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 { +define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 { entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)* @@ -55,7 +55,7 @@ entry: ; CHECK: br i1 ; CHECK: load float, float addrspace(4)* ; CHECK: br label -define void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 { +define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 { entry: %ptr = alloca float addrspace(4)*, align 8 %tmp = call i32 @llvm.amdgcn.workitem.id.x() @@ -85,7 +85,7 @@ helperFunction.exit: ; preds = %if.end.i, %entry ; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel( ; CHECK: phi i32 addrspace(3)* ; CHECK: store i32 %i.03, i32 addrspace(3)* % -define void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 { +define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 { entry: %cmp1 = icmp eq i32 %numElems, 0 br i1 %cmp1, label %for.end, label %for.body.lr.ph @@ -110,7 +110,7 @@ for.end: ; preds = %for.body, %entry ; CHECK-LABEL: @generic_address_bug9899( ; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)* ; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)* -define void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 { +define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 { entry: %tmp1 = call i32 @llvm.amdgcn.workitem.id.x() %tmp2 = zext i32 %tmp1 to i64 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll index bcbca16d7af..08edc20ecf9 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/select.ll @@ -18,7 +18,7 @@ define i32 addrspace(4)* @return_select_group_flat(i1 %c, i32 addrspace(3)* %gro ; CHECK-LABEL: @store_select_group_flat( ; 
CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1 ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { +define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 @@ -43,7 +43,7 @@ define i32 @load_select_group_flat_md(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 ; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2 ; CHECK: store i32 -1, i32 addrspace(4)* %select -define void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 { +define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1 @@ -73,7 +73,7 @@ bb: ; CHECK-LABEL: @store_select_group_flat_null( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null store i32 -1, i32 addrspace(4)* %select @@ -83,7 +83,7 @@ define void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) ; CHECK-LABEL: @store_select_group_flat_null_swap( ; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0 ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0 store i32 -1, i32 addrspace(4)* %select @@ -93,7 +93,7 @@ define void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.p ; CHECK-LABEL: @store_select_group_flat_undef( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef store i32 -1, i32 addrspace(4)* %select @@ -103,7 +103,7 @@ define void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0 ; CHECK-LABEL: @store_select_group_flat_undef_swap( ; 
CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0 ; CHECK: store i32 -1, i32 addrspace(3)* %select -define void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0 store i32 -1, i32 addrspace(4)* %select @@ -114,7 +114,7 @@ define void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group. ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*) ; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16 ; CHECK: store i32 -1, i32 addrspace(3)* %gep -define void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16 @@ -127,7 +127,7 @@ define void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.pt ; CHECK-LABEL: @store_select_group_flat_constexpr( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1 ; CHECK: store i32 7, i32 addrspace(3)* %select -define void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -137,7 +137,7 @@ define void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.p ; CHECK-LABEL: @store_select_group_flat_inttoptr_flat( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*) ; CHECK: store i32 7, i32 addrspace(3)* %select -define void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -147,7 +147,7 @@ define void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %gro ; CHECK-LABEL: @store_select_group_flat_inttoptr_group( ; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) ; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select -define void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* 
inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -158,7 +158,7 @@ define void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %gr ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -169,7 +169,7 @@ define void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrsp ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1 ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0 store i32 7, i32 addrspace(4)* %select @@ -179,7 +179,7 @@ define void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 a ; CHECK-LABEL: @store_select_group_global_mismatch_null_null( ; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_global_mismatch_null_null(i1 %c) #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 { %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select ret void @@ -187,42 +187,42 @@ define void @store_select_group_global_mismatch_null_null(i1 %c) #0 { ; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_null_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr( ; 
CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_gv_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_null_gv_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr( ; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 -define void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr( ; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4 -define void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4 ret void } ; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr( ; CHECK: store i32 7, i32 addrspace(3)* null -define void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { +define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 
ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4 ret void } @@ -233,7 +233,7 @@ define void @store_select_group_global_mismatch_undef_undef_constexpr() #0 { ; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) ; CHECK: store i32 7, i32 addrspace(4)* %select -define void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { +define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 { %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*) store i32 7, i32 addrspace(4)* %select @@ -248,7 +248,7 @@ define void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %gro ; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1 ; CHECK: store i32 -1, i32 addrspace(4)* %extract0 ; CHECK: store i32 -2, i32 addrspace(4)* %extract1 -define void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { +define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 { %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*> %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1 diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll index d9b80e99bf0..79bf92610a8 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: @volatile_load_flat_from_global( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32 addrspace(1)* -define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -16,7 +16,7 @@ define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, ; CHECK-LABEL: @volatile_load_flat_from_constant( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32 addrspace(1)* -define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)* %tmp1 = 
addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -27,7 +27,7 @@ define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input ; CHECK-LABEL: @volatile_load_flat_from_group( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32 addrspace(3)* -define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -38,7 +38,7 @@ define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i ; CHECK-LABEL: @volatile_load_flat_from_private( ; CHECK: load volatile i32, i32 addrspace(4)* ; CHECK: store i32 %val, i32* -define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 { +define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 { %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4 @@ -49,7 +49,7 @@ define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocaptu ; CHECK-LABEL: @volatile_store_flat_to_global( ; CHECK: load i32, i32 addrspace(1)* ; CHECK: store volatile i32 %val, i32 addrspace(4)* -define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -60,7 +60,7 @@ define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i ; CHECK-LABEL: @volatile_store_flat_to_group( ; CHECK: load i32, i32 addrspace(3)* ; CHECK: store volatile i32 %val, i32 addrspace(4)* -define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { +define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 { %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -71,7 +71,7 @@ define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i3 ; CHECK-LABEL: @volatile_store_flat_to_private( ; CHECK: load i32, i32* ; CHECK: store volatile i32 %val, i32 addrspace(4)* -define void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 { +define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 { %tmp0 = addrspacecast i32* %input to i32 addrspace(4)* %tmp1 = addrspacecast i32* %output to i32 addrspace(4)* %val = load i32, i32 addrspace(4)* %tmp0, align 4 @@ -119,7 +119,7 @@ define { i32, i1 } @volatile_cmpxchg_group_to_flat(i32 addrspace(3)* %group.ptr, ; CHECK-LABEL: @volatile_memset_group_to_flat( ; CHECK: addrspacecast i8 addrspace(3)* 
%group.ptr to i8 addrspace(4)* ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) -define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { +define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) ret void @@ -128,7 +128,7 @@ define void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) ; CHECK-LABEL: @volatile_memset_global_to_flat( ; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* ; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true) -define void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { +define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)* call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true) ret void diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll index e6904ee50bc..4b2dab47a20 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll @@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; NOSCOPE: load float ; NOSCOPE: store float ; NOSCOPE: store float -define void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1 store float 0.0, float addrspace(1)* %a, align 4, !noalias !0 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll index d8f72a8e1df..368dc6ab361 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll @@ -10,7 +10,7 @@ target triple = "amdgcn--" ; ALIGNED: load i8, i8* %ptr0, align 1{{$}} ; ALIGNED: load i8, i8* %ptr1, align 1{{$}} -define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i8], align 1 %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset %val0 = load i8, i8* %ptr0, align 1 @@ -27,7 +27,7 @@ define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %o ; ALIGNED: load i16, i16* %ptr0, align 1{{$}} ; ALIGNED: load i16, i16* %ptr1, align 1{{$}} -define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i16], align 1 %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset %val0 = load i16, i16* %ptr0, align 1 @@ -47,7 +47,7 @@ define void 
@load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 ; ALIGNED: load i32, i32* %ptr0, align 1 ; ALIGNED: load i32, i32* %ptr1, align 1 -define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 1 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset %val0 = load i32, i32* %ptr0, align 1 @@ -68,7 +68,7 @@ define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 ; FIXME: Should change alignment ; ALIGNED: load i32 ; ALIGNED: load i32 -define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 16 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset %val0 = load i32, i32* %ptr0, align 1 @@ -85,7 +85,7 @@ define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias % ; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}} ; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}} -define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i8], align 1 %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset store i8 9, i8* %ptr0, align 1 @@ -100,7 +100,7 @@ define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 % ; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}} ; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}} -define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i16], align 1 %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset store i16 9, i16* %ptr0, align 1 @@ -119,7 +119,7 @@ define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 ; ALIGNED: store i32 9, i32* %ptr0, align 1 ; ALIGNED: store i32 10, i32* %ptr1, align 1 -define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { +define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 { %alloca = alloca [128 x i32], align 1 %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset store i32 9, i32* %ptr0, align 1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll index 25abb98c6eb..8a75b8743fa 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll @@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; CHECK: sext i32 %id.x to i64 ; CHECK: load <2 x float> ; CHECK: store <2 x float> zeroinitializer -define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %id.x = 
call i32 @llvm.amdgcn.workitem.id.x() %sext.id.x = sext i32 %id.x to i64 @@ -32,7 +32,7 @@ entry: ; CHECK: zext i32 %id.x to i64 ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %zext.id.x = zext i32 %id.x to i64 @@ -54,7 +54,7 @@ entry: ; CHECK-LABEL: @merge_op_zext_index( ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %id.x, 2 @@ -81,7 +81,7 @@ entry: ; CHECK-LABEL: @merge_op_sext_index( ; CHECK: load <2 x float> ; CHECK: store <2 x float> -define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { +define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 { entry: %id.x = call i32 @llvm.amdgcn.workitem.id.x() %shl = shl i32 %id.x, 2 @@ -112,7 +112,7 @@ entry: ; CHECK: loop: ; CHECK: load <2 x i32> ; CHECK: store <2 x i32> -define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { +define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 { entry: %cmp0 = icmp eq i32 %n, 0 br i1 %cmp0, label %exit, label %loop diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll index 2b2f9cbcf50..6182c09abcf 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll @@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK: load <2 x float> ; CHECK: %w = add i32 %y, 9 ; CHECK: %foo = add i32 %z, %w -define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx @@ -38,7 +38,7 @@ entry: ; CHECK: %w = add i32 %y, 9 ; CHECK: store <2 x float> ; CHECK: %foo = add i32 %z, %w -define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, 
i64 %idx, i32 %x, i32 %y) #0 { +define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 { entry: %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll index 4d6240a9aa9..3f6d7ee7dca 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll @@ -8,7 +8,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK: store double 0.000000e+00, double addrspace(1)* %a, ; CHECK: load double ; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1 -define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { +define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 { entry: %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll index c85be874376..0fcdc7b9083 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -17,7 +17,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; ELT8-UNALIGNED: store <2 x i32> ; ELT16-UNALIGNED: store <4 x i32> -define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -44,7 +44,7 @@ define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 -define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -71,7 +71,7 @@ define void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 ; ELT4-UNALIGNED: store i32 -define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 %out.gep.3 = getelementptr i32, i32* %out, i32 3 @@ -85,7 +85,7 @@ define void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( ; ALL: store <4 x i8> -define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { 
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i32 1 %out.gep.2 = getelementptr i8, i8* %out, i32 2 %out.gep.3 = getelementptr i8, i8* %out, i32 3 @@ -104,7 +104,7 @@ define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { ; ALIGNED: store i8 ; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1 -define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i32 1 %out.gep.2 = getelementptr i8, i8* %out, i32 2 %out.gep.3 = getelementptr i8, i8* %out, i32 3 @@ -118,7 +118,7 @@ define void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 { ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16( ; ALL: store <2 x i16> -define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 4 @@ -131,7 +131,7 @@ define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { ; ALIGNED: store i16 ; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2 -define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 2 @@ -144,7 +144,7 @@ define void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 ; ALIGNED: store i16 ; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1 -define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 1 @@ -154,7 +154,7 @@ define void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8( ; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8 -define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 { +define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 { %out.gep.1 = getelementptr i16, i16* %out, i32 1 store i16 9, i16* %out, align 8 @@ -179,7 +179,7 @@ define void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 ; ELT16-ALIGNED: store i32 ; ELT16-UNALIGNED: store <3 x i32> -define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 @@ -202,7 +202,7 @@ define void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 { ; ELT8-UNALIGNED: store i32 ; ELT16-UNALIGNED: store <3 x i32> -define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 { +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 { %out.gep.1 = getelementptr i32, i32* %out, i32 1 %out.gep.2 = getelementptr i32, i32* %out, i32 2 @@ -218,7 +218,7 @@ define void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 ; ALIGNED: store i8 ; UNALIGNED: 
store <3 x i8> -define void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 { +define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 { %out.gep.1 = getelementptr i8, i8* %out, i8 1 %out.gep.2 = getelementptr i8, i8* %out, i8 2 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll index d32387fa2c0..dbb7068eeae 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK-LABEL: @merge_global_store_2_constants_i8( ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2 -define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -20,7 +20,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align ; CHECK: store <2 x i8> -define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -30,7 +30,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o ; CHECK-LABEL: @merge_global_store_2_constants_i16 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -40,7 +40,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_0_i16 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 0, i16 addrspace(1)* %out.gep.1 @@ -50,7 +50,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align ; CHECK: store <2 x i16> -define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 store i16 123, i16 addrspace(1)* %out.gep.1 @@ -60,7 +60,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align ; CHECK: store <2 x half> -define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 { %out.gep.1 = getelementptr half, half addrspace(1)* %out, 
i32 1 store half 2.0, half addrspace(1)* %out.gep.1 @@ -70,7 +70,7 @@ define void @merge_global_store_2_constants_half_natural_align(half addrspace(1) ; CHECK-LABEL: @merge_global_store_2_constants_i32 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -80,7 +80,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)* store float 1.0, float addrspace(1)* %out.gep.1.bc @@ -90,7 +90,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32 ; CHECK store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}} -define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)* store i32 123, i32 addrspace(1)* %out.gep.1.bc @@ -100,7 +100,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 ; CHECK-LABEL: @merge_global_store_4_constants_i32 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -114,7 +114,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_4_constants_f32_order ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}} -define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -129,7 +129,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) ; First store is out of order. 
; CHECK-LABEL: @merge_global_store_4_constants_f32 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -143,7 +143,7 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -160,7 +160,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o ; CHECK-LABEL: @merge_global_store_3_constants_i32 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 @@ -172,7 +172,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_2_constants_i64 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 store i64 123, i64 addrspace(1)* %out.gep.1 @@ -183,7 +183,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_global_store_4_constants_i64 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8 -define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1 %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2 %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3 @@ -202,7 +202,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 { ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1 ; CHECK: store <2 x i32> [[INSERT1]] -define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr 
i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -220,7 +220,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; CHECK: insertelement ; CHECK: insertelement ; CHECK: store <2 x i32> -define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3 @@ -241,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace( ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1 ; CHECK: store <2 x i32> [[INSERT1]] -define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -256,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -279,7 +279,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32 ; CHECK: load <3 x i32> ; CHECK: store <3 x i32> -define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1 @@ -298,7 +298,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32 ; CHECK: load <4 x float> ; CHECK: store <4 x float> -define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3 @@ -321,7 +321,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void 
@merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11 %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12 %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13 @@ -346,7 +346,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace( ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -373,7 +373,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* % ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -408,7 +408,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* % ; CHECK: insertelement <4 x i8> ; CHECK: insertelement <4 x i8> ; CHECK: store <4 x i8> -define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -431,7 +431,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align ; CHECK: load <4 x i8> ; CHECK: store <4 x i8> -define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1 %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2 %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3 @@ -454,7 +454,7 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32 ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> -define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { +define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 @@ -474,7 +474,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out ; 
CHECK-LABEL: @merge_local_store_2_constants_i8 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2 -define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1 store i8 123, i8 addrspace(3)* %out.gep.1 @@ -484,7 +484,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 { ; CHECK-LABEL: @merge_local_store_2_constants_i32 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4 -define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1 @@ -495,7 +495,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 { ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2 ; CHECK: store i32 ; CHECK: store i32 -define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 store i32 123, i32 addrspace(3)* %out.gep.1, align 2 @@ -506,7 +506,7 @@ define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) # ; CHECK-LABEL: @merge_local_store_4_constants_i32 ; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)* ; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)* -define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { +define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3 @@ -521,7 +521,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { ; CHECK-LABEL: @merge_global_store_5_constants_i32 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store i32 -define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { store i32 9, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 12, i32 addrspace(1)* %idx1, align 4 @@ -537,7 +537,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @merge_global_store_6_constants_i32 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { store i32 13, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 15, i32 addrspace(1)* %idx1, align 4 @@ -555,7 +555,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @merge_global_store_7_constants_i32 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: 
store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -575,7 +575,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @merge_global_store_8_constants_i32 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4 -define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { +define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { store i32 34, i32 addrspace(1)* %out, align 4 %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1 store i32 999, i32 addrspace(1)* %idx1, align 4 @@ -597,7 +597,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) { ; CHECK-LABEL: @copy_v3i32_align4 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out -define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 { %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4 store <3 x i32> %vec, <3 x i32> addrspace(1)* %out ret void @@ -606,7 +606,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a ; CHECK-LABEL: @copy_v3i64_align4 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out -define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4 store <3 x i64> %vec, <3 x i64> addrspace(1)* %out ret void @@ -615,7 +615,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a ; CHECK-LABEL: @copy_v3f32_align4 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 ; CHECK: store <3 x float> -define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 { %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4 %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0> store <3 x float> %fadd, <3 x float> addrspace(1)* %out @@ -625,7 +625,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa ; CHECK-LABEL: @copy_v3f64_align4 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out -define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { +define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 { %vec = load <3 x double>, <3 x 
double> addrspace(1)* %in, align 4 %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0> store <3 x double> %fadd, <3 x double> addrspace(1)* %out diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll index 8885d61014f..226147df66a 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; CHECK-LABEL: @merge_v2i32_v2i32( ; CHECK: load <4 x i32> ; CHECK: store <4 x i32> zeroinitializer -define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1 @@ -22,7 +22,7 @@ entry: ; CHECK-LABEL: @merge_v1i32_v1i32( ; CHECK: load <2 x i32> ; CHECK: store <2 x i32> zeroinitializer -define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1 @@ -41,7 +41,7 @@ entry: ; CHECK: load <3 x i32> ; CHECK: store <3 x i32> zeroinitializer ; CHECK: store <3 x i32> zeroinitializer -define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1 @@ -58,7 +58,7 @@ entry: ; CHECK-LABEL: @merge_v2i16_v2i16( ; CHECK: load <4 x i16> ; CHECK: store <4 x i16> zeroinitializer -define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1 @@ -76,7 +76,7 @@ entry: ; CHECK-LABEL: @merge_load_i32_v2i16( ; CHECK: load i32, ; CHECK: load <2 x i16> -define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1 %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)* diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll index ba792f78353..f353106607d 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll @@ -7,7 +7,7 @@ ; CHECK-LABEL: @load_keep_base_alignment_missing_align( ; 
CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { +define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11 %val0 = load float, float addrspace(3)* %ptr0 @@ -21,7 +21,7 @@ define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) { ; CHECK-LABEL: @store_keep_base_alignment_missing_align( ; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4 -define void @store_keep_base_alignment_missing_align() { +define amdgpu_kernel void @store_keep_base_alignment_missing_align() { %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1 %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2 store float 0.0, float addrspace(3)* %arrayidx0 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll index 88eca363902..8a78f3d7e9b 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll @@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64: ; CHECK: store i32 0 ; CHECK: store i32 0 -define void @no_crash(i32 %arg) { +define amdgpu_kernel void @no_crash(i32 %arg) { %tmp2 = add i32 %arg, 14 %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2 %tmp4 = add i32 %arg, 15 @@ -37,7 +37,7 @@ define void @no_crash(i32 %arg) { ; CHECK: load i32 ; CHECK: load i32 -define void @interleave_get_longest(i32 %arg) { +define amdgpu_kernel void @interleave_get_longest(i32 %arg) { %a1 = add i32 %arg, 1 %a2 = add i32 %arg, 2 %a3 = add i32 %arg, 3 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll index 4a429533df0..818189565b4 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll @@ -5,7 +5,7 @@ ; CHECK: store i32 ; CHECK: store i32 ; CHECK: store i32 -define void @no_implicit_float(i32 addrspace(1)* %out) #0 { +define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2 %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll index 141e20a1f83..28d29f8e813 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: @optnone( ; CHECK: store i32 ; CHECK: store i32 -define void @optnone(i32 addrspace(1)* %out) noinline optnone { +define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 @@ -13,7 +13,7 @@ define void @optnone(i32 addrspace(1)* %out) noinline optnone { ; CHECK-LABEL: @do_opt( ; CHECK: store <2 x i32> -define void @do_opt(i32 addrspace(1)* %out) { +define amdgpu_kernel void @do_opt(i32 
addrspace(1)* %out) { %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1 store i32 123, i32 addrspace(1)* %out.gep.1 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll index 202e988ea5f..65200b95d5e 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll @@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)* ; CHECK: store <2 x i64> zeroinitializer -define void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1 @@ -28,7 +28,7 @@ entry: ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)* ; CHECK: store <2 x i32> zeroinitializer -define void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { +define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1 %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1 @@ -46,7 +46,7 @@ entry: ; CHECK: load <2 x i64> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* @@ -61,7 +61,7 @@ entry: ; CHECK: load <2 x i64> ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0 ; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)* -define void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -76,7 +76,7 @@ entry: ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64 ; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0 ; CHECK: store <2 x i64> -define void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { +define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -92,7 +92,7 @@ entry: ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* 
%ptr1) #0 { +define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)* @@ -107,7 +107,7 @@ entry: ; CHECK: load <2 x i32> ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)* -define void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)* @@ -122,7 +122,7 @@ entry: ; CHECK: load <2 x i32> ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0 ; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)* -define void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -137,7 +137,7 @@ entry: ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32 ; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0 ; CHECK: store <2 x i32> -define void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { +define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 { entry: %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)* %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1 @@ -152,7 +152,7 @@ entry: ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32 ; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1 ; CHECK: store <2 x i32> -define void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1 %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)* @@ -166,7 +166,7 @@ entry: ; CHECK-LABEL: @no_merge_store_ptr32_i64( ; CHECK: store i8 addrspace(3)* ; CHECK: store i64 -define void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { +define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -181,7 +181,7 @@ entry: ; CHECK-LABEL: @no_merge_store_i64_ptr32( ; CHECK: store i64 ; CHECK: store i8 addrspace(3)* -define void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { +define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 { entry: %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)* @@ -195,7 +195,7 @@ entry: ; CHECK-LABEL: @no_merge_load_i64_ptr32( ; CHECK: 
load i64, ; CHECK: load i8 addrspace(3)*, -define void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)* @@ -209,7 +209,7 @@ entry: ; CHECK-LABEL: @no_merge_load_ptr32_i64( ; CHECK: load i8 addrspace(3)*, ; CHECK: load i64, -define void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)* %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1 @@ -226,7 +226,7 @@ entry: ; CHECK: load <2 x i8 addrspace(1)*> ; CHECK: store <2 x i8 addrspace(1)*> ; CHECK: store <2 x i8 addrspace(1)*> -define void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { +define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 { entry: %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1 %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1 @@ -245,7 +245,7 @@ entry: ; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)* ; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: bitcast i64 [[ELT1_INT]] to double -define void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 { entry: %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 @@ -262,7 +262,7 @@ entry: ; CHECK: bitcast i64 [[ELT0]] to double ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)* -define void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { +define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 { entry: %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)* @@ -279,7 +279,7 @@ entry: ; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { +define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 { entry: %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)* %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1 @@ -296,7 +296,7 @@ entry: ; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1 ; CHECK: store <2 x i64> -define void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { +define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 { entry: %a.1 
= getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1 %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)* diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll index d70c449e14d..63e688e63fb 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll @@ -9,7 +9,7 @@ ; CHECK: store <4 x float> ; Function Attrs: nounwind -define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { +define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 { bb: %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)* %tmp1 = load float, float addrspace(1)* %tmp, align 4 diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll index 18f62be27c8..412d2013f6b 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll @@ -16,7 +16,7 @@ declare void @use_v2i9(<2 x i9>) ; CHECK-LABEL: @merge_store_2_constants_i1( ; CHECK: store i1 ; CHECK: store i1 -define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 store i1 true, i1 addrspace(1)* %out.gep.1 store i1 false, i1 addrspace(1)* %out @@ -26,7 +26,7 @@ define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_i2( ; CHECK: store i2 1 ; CHECK: store i2 -1 -define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 store i2 1, i2 addrspace(1)* %out.gep.1 store i2 -1, i2 addrspace(1)* %out @@ -36,7 +36,7 @@ define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_store_sizes_i1_i8( ; CHECK: store i1 true ; CHECK: store i8 123 -define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 store i1 true, i1 addrspace(1)* %out.i1 @@ -47,7 +47,7 @@ define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_store_sizes_i8_i1( ; CHECK: store i8 123 ; CHECK: store i1 true -define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 store i8 123, i8 addrspace(1)* %out.gep.1 @@ -58,7 +58,7 @@ define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constant_structs( ; CHECK: store %struct.foo ; CHECK: store %struct.foo -define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { %out.gep.1 = 
getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1 store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out @@ -69,7 +69,7 @@ define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_v2i2( ; CHECK: store <2 x i2> ; CHECK: store <2 x i2> -define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1 store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out @@ -81,7 +81,7 @@ define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_v4i2( ; CHECK: store <4 x i2> ; CHECK: store <4 x i2> -define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1 store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out @@ -91,7 +91,7 @@ define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_i1( ; CHECK: load i1 ; CHECK: load i1 -define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1 %x = load i1, i1 addrspace(1)* %out.gep.1 %y = load i1, i1 addrspace(1)* %out @@ -103,7 +103,7 @@ define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_i2( ; CHECK: load i2 ; CHECK: load i2 -define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1 %x = load i2, i2 addrspace(1)* %out.gep.1 %y = load i2, i2 addrspace(1)* %out @@ -115,7 +115,7 @@ define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_load_sizes_i1_i8( ; CHECK: load i1 ; CHECK: load i8 -define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1 %x = load i1, i1 addrspace(1)* %out.i1 @@ -128,7 +128,7 @@ define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_different_load_sizes_i8_i1( ; CHECK: load i8 ; CHECK: load i1 -define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)* %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1 %x = load i8, i8 addrspace(1)* %out.gep.1 @@ -141,7 +141,7 @@ define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constant_structs( ; CHECK: load %struct.foo ; CHECK: load %struct.foo -define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { +define amdgpu_kernel void 
@merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1 %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1 %y = load %struct.foo, %struct.foo addrspace(1)* %out @@ -153,7 +153,7 @@ define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_v2i2( ; CHECK: load <2 x i2> ; CHECK: load <2 x i2> -define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1 %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1 %y = load <2 x i2>, <2 x i2> addrspace(1)* %out @@ -165,7 +165,7 @@ define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_v4i2( ; CHECK: load <4 x i2> ; CHECK: load <4 x i2> -define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1 %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1 %y = load <4 x i2>, <4 x i2> addrspace(1)* %out @@ -177,7 +177,7 @@ define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_store_2_constants_i9( ; CHECK: store i9 3 ; CHECK: store i9 -5 -define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1 store i9 3, i9 addrspace(1)* %out.gep.1 store i9 -5, i9 addrspace(1)* %out @@ -187,7 +187,7 @@ define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 { ; CHECK-LABEL: @merge_load_2_constants_v2i9( ; CHECK: load <2 x i9> ; CHECK: load <2 x i9> -define void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { +define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 { %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1 %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1 %y = load <2 x i9>, <2 x i9> addrspace(1)* %out diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll index ccad351f66f..054c61d1879 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll @@ -17,7 +17,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24: ; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst ; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst ; OPT: br i1 %exitcond -define void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { +define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 { bb: %tmp = icmp sgt i32 %n, 0 br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge @@ -54,7 +54,7 @@ bb: ; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ] ; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383 ; OPT: %tmp4 = cmpxchg i32 
addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic
-define void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index bf61112a3c3..c5ea1b915d9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
-define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -48,7 +48,7 @@ bb:
; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
-define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -83,7 +83,7 @@ bb:
; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
-define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -122,7 +122,7 @@ bb:
; OPT: {{^}}.lr.ph:
; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
-define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
bb:
  %tmp = icmp sgt i32 %n, 0
  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
index 89b62632cac..02c3c05e794 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
@@ -15,7 +15,7 @@ target triple = "amdgcn--"
;CHECK: buffer_store_dword
;CHECK: s_branch [[LOOP_LABEL]]
-define void @foo() {
+define amdgpu_kernel void @foo() {
entry:
  br label %loop
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
index 8c83df5843d..67b1926bdf2 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -16,7 +16,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; CHECK: bb:
; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)*
; CHECK: %c1 = icmp ne i8 addrspace(3)*
-define void @local_cmp_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
entry:
  br label %bb11
@@ -47,7 +47,7 @@ bb13:
; CHECK: bb:
; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)*
; CHECK: icmp ne i8 addrspace(1)* %t
-define void @global_cmp_user(i64 %arg0) nounwind {
+define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
entry:
  br label %bb11
@@ -78,7 +78,7 @@ bb13:
; CHECK: bb:
; CHECK: %idxprom = sext i32 %lsr.iv1 to i64
; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom
-define void @global_gep_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind {
entry:
  br label %bb11
@@ -108,7 +108,7 @@ bb13:
; CHECK: bb
; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
-define void @global_sext_scale_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind {
entry:
  br label %bb11
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
index b3b696d42c5..9eba0c3051d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4
; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep
-define void @lsr_crash_preserve_addrspace_unknown_type() #0 {
+define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
bb:
  br label %bb1
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
index e732ddc2bc8..ca8cc32469d 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
@@ -6,7 +6,7 @@
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK: call void @llvm.amdgcn.s.barrier()
; CHECK-NOT: br
-define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
+define amdgpu_kernel void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
entry:
  br label %for.body
diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
index 915a13d4961..e986c3dc2a2 100644
--- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
+++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
@@ -7,7 +7,7 @@
; CHECK: store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
; CHECK: ret void
-define void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
  %arr = alloca [64 x i32], align 4
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -40,7 +40,7 @@ for.body: ; preds = %for.body, %entry
; CHECK: br i1 %[[exitcond]]
; CHECK-NOT: icmp eq i32 %{{.*}}, 100
-define void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
  %arr = alloca [64 x i32], align 4
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -82,7 +82,7 @@ for.body6: ; preds = %for.body6, %for.con
; CHECK: icmp eq i32 %{{.*}}, 100
; CHECK: br
-define void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
+define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
  %arr = alloca [256 x i32], align 4
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -116,7 +116,7 @@ for.body: ; preds = %for.body, %entry
; CHECK: icmp eq i32 %{{.*}}, 100
; CHECK: br
-define void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
+define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
entry:
  %arr = alloca i32, i32 %n, align 4
  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
index e70467a9963..1f106bd894a 100644
--- a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
+++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -3,14 +3,14 @@
; Check that loop unswitch happened and condition hoisted out of the loop.
; Condition is uniform so all targets should perform unswitching.
-; CHECK-LABEL: {{^}}define void @uniform_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
; CHECK: entry:
; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
; CHECK-NEXT: br i1
-define void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
entry:
  %cmp6 = icmp sgt i32 %n, 0
  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
@@ -42,14 +42,14 @@ for.inc: ; preds = %for.body, %if.then
; Check that loop unswitch does not happen if condition is divergent.
-; CHECK-LABEL: {{^}}define void @divergent_unswitch
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @divergent_unswitch
; CHECK: entry:
; CHECK: icmp
; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890
; CHECK: br label
; CHECK: br i1 [[IF_COND]]
-define void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
+define amdgpu_kernel void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
entry:
  %cmp9 = icmp sgt i32 %n, 0
  br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
index 85ba95cab16..f303ed5377e 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
@@ -7,7 +7,7 @@
; CHECK: store i32
; CHECK-NOT: store i32
; CHECK: ret
-define void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {
entry:
  %0 = icmp sgt i32 %size, 0
  br i1 %0, label %loop, label %exit
diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
index 35763953911..63c6d77954d 100644
--- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-
; Simple 3-pair chain with loads and stores
-define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
; CHECK-LABEL: @test1_as_3_3_3(
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
@@ -29,7 +29,7 @@ define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, do
  ret void
}
-define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
+define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
; CHECK-LABEL: @test1_as_3_0_0(
; CHECK: load <2 x double>, <2 x double> addrspace(3)*
; CHECK: load <2 x double>, <2 x double>*
@@ -49,7 +49,7 @@ define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
  ret void
}
-define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
; CHECK-LABEL: @test1_as_0_0_3(
; CHECK: load <2 x double>, <2 x double>*
; CHECK: load <2 x double>, <2 x double>*
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 5815ae62737..23ec0ca2554 100644
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
-define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  %tmp = sext i32 %y to i64
  %tmp1 = sext i32 %x to i64
  %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
@@ -42,7 +42,7 @@ define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output)
; IR: add i32 %x, 256
; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  %tmp = sext i32 %y to i64
  %tmp1 = sext i32 %x to i64
  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
@@ -74,7 +74,7 @@ define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
-define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
  %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
  %tmp4 = load float, float addrspace(3)* %tmp2, align 4
  %tmp5 = fadd float %tmp4, 0.000000e+00
diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
index f2853aca698..9554ae69031 100644
--- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset(
; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]]
; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]]
-define void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
bb:
  %i2 = shl nsw i32 %i, 1
  %j1 = add nsw i32 %i, 1023
@@ -33,7 +33,7 @@ bb:
; CHECK: %tmp = sext i32 %j1 to i64
; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
-define void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
bb:
  %i2 = shl nsw i32 %i, 1
  %j1 = add nsw i32 %i, 1024
@@ -61,7 +61,7 @@ bb:
; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i
; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383
-define void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
bb:
  %i2 = shl nsw i32 %i, 1
  %j1 = add nsw i32 %i, 16383
@@ -86,7 +86,7 @@ bb:
; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
; CHECK: %j2 = add i32 %j1, %i
; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
-define void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
bb:
  %i2 = shl nsw i32 %i, 1
  %j1 = add nsw i32 %i, 16384