author     David Stuttard <david.stuttard@amd.com>    2019-01-14 11:55:24 +0000
committer  David Stuttard <david.stuttard@amd.com>    2019-01-14 11:55:24 +0000
commit     f77079f892548efa3b34c16233d8779d25d92f58 (patch)
tree       1c3da48a6063defe55051d3955d1a524108ae682 /llvm/test
parent     d1986d1b5a671bb73481f595a9d7e130b11f0d55 (diff)
[AMDGPU] Add support for TFE/LWE in image intrinsics. 2nd try
TFE and LWE support requires extra result registers that are written in the
event of a failure, so that the failure case can be detected.
The specific use case that initiated these changes is sparse texture support.
This means that if image intrinsics are used with either option turned on, the
programmer must ensure that the return type can contain all of the expected
results. This can result in redundant registers, since the vector size must be
a power of 2.
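As a rough illustration of that padding requirement (mirroring the
load_1d_tfe_V4_dmask3 test added below), a TFE load that only wants three
channels still has to declare a power-of-2 data vector, with the trailing i32
holding the error flag:
  ; dmask=7 requests only three channels, but the data part of the aggregate
  ; must still be a power-of-2 vector (<4 x float>), so one element is unused
  ; padding; the trailing i32 is the TFE/LWE status word.
  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
  %data = extractvalue {<4 x float>, i32} %v, 0
  %err = extractvalue {<4 x float>, i32} %v, 1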
This change comprises roughly 6 parts:
1. Modify the instruction defs in tablegen to add new instruction variants that
can accommodate the extra return values.
2. Update lowerImage in SIISelLowering.cpp to accommodate setting TFE or LWE
(where the bulk of the work for these instruction types is now done).
3. Add extra verification code to catch cases where the intrinsics have been
used but insufficient return registers have been provided.
4. Modify the adjustWritemask optimisation to account for TFE/LWE being
enabled (extra registers must be maintained for the error return value).
5. Add an extra pass to zero-initialize the error return value - this is needed
because if the error does not occur, the register is not written and thus must
be zeroed before use. Also add a new (on by default) option to ensure ALL
return values are zero-initialized, which is required for sparse texture
support.
6. Disable the inst_combine optimisation in the presence of tfe/lwe (a later
TODO is to re-enable this and handle it correctly); see the IR sketch after
this list.
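Two of these items are directly visible in the tests added below: the
zero-initialization from item 5 is controlled by the new, on-by-default
enable-prt-strict-null subtarget feature (the NOPRT run lines disable it with
-mattr=-enable-prt-strict-null), and item 6 is pinned down by new InstCombine
tests that expect the call to be left untouched when tfe or lwe is set. A
minimal sketch of that InstCombine pattern, using the mangled intrinsic name
from amdgcn-demanded-vector-elts.ll:
  ; Only element 0 of the data vector is used, but with tfe set (flag 1)
  ; the dmask is not shrunk and the call is left as-is.
  %data = call {<4 x float>, i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
  %data.vec = extractvalue {<4 x float>, i32} %data, 0
  %elt0 = extractelement <4 x float> %data.vec, i32 0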
There is an additional fix now to avoid a dmask of 0.
For an image intrinsic with tfe where all result channels except tfe
were unused, I was getting an image instruction with dmask=0 and only a
single vgpr result for tfe. That is incorrect because the hardware
assumes there is at least one vgpr result, plus the one for tfe.
Fixed by forcing dmask to 1, which gives the desired two-vgpr result
with tfe in the second one.
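A minimal sketch of the triggering pattern (exercised by the
load_mip_2d_tfe_nouse and *_dmask0 tests below; the value names here are
illustrative): only the error word of the aggregate is consumed, yet the
selected instruction must still use dmask=1 so that the data and tfe results
occupy two vgprs:
  ; Only the TFE/LWE error word is used; the selected machine instruction
  ; must still keep dmask=1 (one data vgpr plus the tfe vgpr).
  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
  %v.err = extractvalue {<4 x float>, i32} %v, 1
  %err.f = bitcast i32 %v.err to float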
The TFE or LWE result is returned from the intrinsics using an aggregate
type. Look in the test code provided to see how this works, but in essence the
IR code to invoke the intrinsic looks as follows:
  %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
  %v.vec = extractvalue {<4 x float>, i32} %v, 0
  %v.err = extractvalue {<4 x float>, i32} %v, 1
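The penultimate i32 operand selects the failure mode: in the tests below the
tfe variants pass 1, the lwe variants pass 2, and the combined tfe+lwe variant
passes 3, so as a sketch the calls for the other two modes look like:
  ; lwe only (flag 2), as in the load_1d_lwe test
  %v.lwe = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
  ; tfe and lwe together (flag 3), as in the load_3d_tfe_lwe test
  %v.both = call {<4 x float>, i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)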
This resubmission of the change also includes a slight modification in
SIISelLowering.cpp to work around a compiler bug on the powerpc_le
platform that caused a buildbot failure on a previous submission.
Differential revision: https://reviews.llvm.org/D48826
Change-Id: If222bc03642e76cf98059a6bef5d5bffeda38dda
Workaround for ppcle compiler bug
Change-Id: Ie284cf24b2271215be1b9dc95b485fd15000e32b
llvm-svn: 351054
Diffstat (limited to 'llvm/test')
6 files changed, 708 insertions, 16 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll index bf93ffa937a..b297acab36c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SIVI,PRT %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,PRT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,PRT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,NOPRT %s ; GCN-LABEL: {{^}}load_1d: ; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}} @@ -10,6 +11,52 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_1d_tfe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_lwe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_2d: ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) { @@ -18,6 +65,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_2d_tfe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: 
v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_3d: ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) { @@ -26,6 +96,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_3d_tfe_lwe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_cube: ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { @@ -34,6 +127,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_cube_lwe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_1darray: ; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}} define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) { @@ 
-42,6 +158,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_1darray_tfe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_2darray: ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) { @@ -50,6 +189,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_2darray_lwe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_2dmsaa: ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) { @@ -58,6 +220,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_2dmsaa_both: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 
1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_2darraymsaa: ; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}} define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) { @@ -66,6 +251,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_2darraymsaa_tfe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_mip_1d: ; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) { @@ -74,6 +282,29 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_mip_1d_lwe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}load_mip_2d: ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { @@ -82,6 +313,191 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}load_mip_2d_tfe: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v4, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT-NOT: v_mov_b32_e32 v3 +; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}} +; SIVI: buffer_store_dword v4, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4 +define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> 
inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; Make sure that error flag is returned even with dmask 0 +; GCN-LABEL: {{^}}load_1d_V2_tfe_dmask0: +; GCN: v_mov_b32_e32 v1, 0 +; PRT-DAG: v_mov_b32_e32 v2, v1 +; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}} +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}} +define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue {<2 x float>, i32} %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; GCN-LABEL: {{^}}load_1d_V1_tfe_dmask0: +; GCN: v_mov_b32_e32 v1, 0 +; PRT-DAG: v_mov_b32_e32 v2, v1 +; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}} +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}} +define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) { +main_body: + %v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue {float, i32} %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; GCN-LABEL: {{^}}load_mip_2d_tfe_dmask0: +; GCN: v_mov_b32_e32 v3, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3 +; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}} +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}} +define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 0, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue {<4 x float>, i32} %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; Do not make dmask 0 even if no result (other than tfe) is used. 
+; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse: +; GCN: v_mov_b32_e32 v3, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3 +; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}} +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}} +define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue {<4 x float>, i32} %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V2: +; GCN: v_mov_b32_e32 v3, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3 +; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}} +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}} +define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { +main_body: + %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 6, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue {<2 x float>, i32} %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V1: +; GCN: v_mov_b32_e32 v3, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3 +; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}} +; NOPRT-NOT: v_mov_b32_e32 v2 +; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}} +define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) { +main_body: + %v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 2, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0) + %v.err = extractvalue {float, i32} %v, 1 + %vv = bitcast i32 %v.err to float + ret float %vv +} + +; Check for dmask being materially smaller than return type +; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v3, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; NOPRT-NOT: v_mov_b32_e32 v2 +; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}} +; SIVI: buffer_store_dword v3, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v3 +define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v2, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; NOPRT-NOT: v_mov_b32_e32 v1 +; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}} +; SIVI: buffer_store_dword v2, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2 +define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x 
float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v1, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}} +; SIVI: buffer_store_dword v1, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1 +define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1: +; PRT: v_mov_b32_e32 v0, 0 +; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0 +; NOPRT: v_mov_b32_e32 v1, 0 +; NOPRT-NOT: v_mov_b32_e32 v0 +; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}} +; SIVI: buffer_store_dword v1, off, s[8:11], 0 +; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1 +define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) { +main_body: + %v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0) + %v.vec = extractvalue {<2 x float>, i32} %v, 0 + %v.err = extractvalue {<2 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <2 x float> %v.vec +} + + ; GCN-LABEL: {{^}}load_mip_3d: ; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) { @@ -404,23 +820,37 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3) store float 0.000000e+00, float addrspace(3)* %lds %c0 = extractelement <2 x i32> %c, i32 0 %c1 = extractelement <2 x i32> %c, i32 1 - %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0) + %tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0) %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4 store float 0.000000e+00, float addrspace(3)* %tmp2 ret float %tex } declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} 
@llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 +declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll index 1fbfccb0e39..fd2c6e796c8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.d16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; GCN-LABEL: {{^}}load.f16.1d: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16 +; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16 define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -10,7 +10,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f16.1d: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16 +; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16 define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -37,7 +37,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f16.2d: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16 +; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16 define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -47,7 +47,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f16.2d: -; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16 +; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16 define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x 
i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -77,7 +77,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f16.3d: -; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16 +; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 d16 define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -88,7 +88,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f16.3d: -; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16 +; GCN: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm a16 d16 define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll index d857ae115a7..be579b84eb4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.load.a16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s ; GCN-LABEL: {{^}}load.f32.1d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16 +; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -10,7 +10,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f32.1d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16 +; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -37,7 +37,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f32.2d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16 +; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -47,7 +47,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f32.2d: -; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16 +; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) { main_body: %x = extractelement <2 x i16> %coords, i32 0 @@ -77,7 +77,7 @@ main_body: } ; GCN-LABEL: {{^}}load.f32.3d: -; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16 +; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 @@ -88,7 +88,7 @@ main_body: } ; GCN-LABEL: {{^}}load.v2f32.3d: -; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16 +; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) { main_body: %x = extractelement <2 x i16> %coords_lo, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll index 9619304e1aa..b6260f4af83 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -10,6 +10,19 @@ main_body: ret half %tex } +; GCN-LABEL: 
{{^}}image_sample_2d_f16_tfe: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0 +; PACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}} +; UNPACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}} +define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) { +main_body: + %tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) + %tex.vec = extractvalue {half, i32} %tex, 0 + %tex.err = extractvalue {half, i32} %tex, 1 + store i32 %tex.err, i32 addrspace(1)* %out, align 4 + ret half %tex.vec +} + ; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16: ; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}} ; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}} @@ -20,6 +33,22 @@ main_body: ret float %r } +; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0 +; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}} +; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}} +define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) { +main_body: + %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) + %tex.vec = extractvalue {<2 x half>, i32} %tex, 0 + %tex.err = extractvalue {<2 x half>, i32} %tex, 1 + %tex.vecf = bitcast <2 x half> %tex.vec to float + %r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0 + %tex.errf = bitcast i32 %tex.err to float + %r = insertelement <2 x float> %r.0, float %tex.errf, i32 1 + ret <2 x float> %r +} + ; GCN-LABEL: {{^}}image_sample_b_2d_v4f16: ; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}} ; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}} @@ -30,9 +59,33 @@ main_body: ret <2 x float> %r } +; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe: +; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0 +; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}} +; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}} +define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) { +main_body: + %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) + %tex.vec = extractvalue {<4 x half>, i32} %tex, 0 + %tex.err = extractvalue {<4 x half>, i32} %tex, 1 + %tex.vecf = bitcast <4 x half> %tex.vec to <2 x float> + %tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0 + %tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1 + %r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0 + %r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1 + %tex.errf = bitcast i32 %tex.err to float + %r = insertelement <4 x float> %r.1, float %tex.errf, i32 2 + ret <4 x float> %r +} + declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x 
i32>, <4 x i32>, i1, i32, i32) #1 +declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll index 65f4b46d0ae..2ee69ac6e8b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -9,6 +9,162 @@ main_body: ret <4 x float> %v } +; GCN-LABEL: {{^}}sample_1d_tfe: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v2, v0 +; GCN: v_mov_b32_e32 v3, v0 +; GCN: v_mov_b32_e32 v4, v0 +; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}} +define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}} +define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f = extractelement <4 x float> %res.vec, i32 0 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 + %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1 + ret <2 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}} +define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f = extractelement <4 x float> %res.vec, i32 1 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 + %res = insertelement <2 x 
float> %res.tmp, float %res.errf, i32 1 + ret <2 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}} +define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f = extractelement <4 x float> %res.vec, i32 2 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 + %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1 + ret <2 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}} +define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f = extractelement <4 x float> %res.vec, i32 3 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp = insertelement <2 x float> undef, float %res.f, i32 0 + %res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1 + ret <2 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v2, v0 +; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}} +define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f1 = extractelement <4 x float> %res.vec, i32 0 + %res.f2 = extractelement <4 x float> %res.vec, i32 1 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0 + %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1 + %res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2 + ret <4 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v2, v0 +; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}} +define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f1 = extractelement <4 x float> %res.vec, i32 1 + %res.f2 = extractelement <4 x float> %res.vec, i32 3 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0 + %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, 
i32 1 + %res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2 + ret <4 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v2, v0 +; GCN: v_mov_b32_e32 v3, v0 +; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}} +define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %res.vec = extractvalue {<4 x float>,i32} %v, 0 + %res.f1 = extractelement <4 x float> %res.vec, i32 0 + %res.f2 = extractelement <4 x float> %res.vec, i32 2 + %res.f3 = extractelement <4 x float> %res.vec, i32 3 + %res.err = extractvalue {<4 x float>,i32} %v, 1 + %res.errf = bitcast i32 %res.err to float + %res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0 + %res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1 + %res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2 + %res = insertelement <4 x float> %res.tmp3, float %res.errf, i32 3 + ret <4 x float> %res +} + +; GCN-LABEL: {{^}}sample_1d_lwe: +; GCN: v_mov_b32_e32 v0, 0 +; GCN: v_mov_b32_e32 v1, v0 +; GCN: v_mov_b32_e32 v2, v0 +; GCN: v_mov_b32_e32 v3, v0 +; GCN: v_mov_b32_e32 v4, v0 +; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}} +define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) { +main_body: + %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0) + %v.vec = extractvalue {<4 x float>, i32} %v, 0 + %v.err = extractvalue {<4 x float>, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret <4 x float> %v.vec +} + ; GCN-LABEL: {{^}}sample_2d: ; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}} define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { @@ -361,6 +517,17 @@ main_body: ret float %v } +; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1_tfe: +; GCN: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da{{$}} +define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) { +main_body: + %v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %v.vec = extractvalue {float, i32} %v, 0 + %v.err = extractvalue {float, i32} %v, 1 + store i32 %v.err, i32 addrspace(1)* %out, align 4 + ret float %v.vec +} + ; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2: ; GCN: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da{{$}} define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) { @@ -369,6 +536,22 @@ main_body: ret <2 x float> %v } +; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe: +; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}} +define amdgpu_ps 
<4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) { +main_body: + %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) + %v.vec = extractvalue {<2 x float>, i32} %v, 0 + %v.f1 = extractelement <2 x float> %v.vec, i32 0 + %v.f2 = extractelement <2 x float> %v.vec, i32 1 + %v.err = extractvalue {<2 x float>, i32} %v, 1 + %v.errf = bitcast i32 %v.err to float + %res.0 = insertelement <4 x float> undef, float %v.f1, i32 0 + %res.1 = insertelement <4 x float> %res.0, float %v.f2, i32 1 + %res.2 = insertelement <4 x float> %res.1, float %v.errf, i32 2 + ret <4 x float> %res.2 +} + ; GCN-LABEL: {{^}}sample_1d_unorm: ; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm{{$}} define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) { @@ -491,6 +674,7 @@ main_body: } declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 @@ -542,7 +726,9 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {float, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readonly } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll index 7ca0d77e491..75e861832e2 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/amdgcn-demanded-vector-elts.ll @@ -1288,6 +1288,28 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8 ret float %elt0 } +; Check that the intrinsic remains unchanged in the presence of TFE or LWE +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe( +; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x 
i32> %rsrc, i1 false, i32 1, i32 0) +; CHECK: ret float %elt0 +define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0) + %data.vec = extractvalue {<4 x float>,i32} %data, 0 + %elt0 = extractelement <4 x float> %data.vec, i32 0 + ret float %elt0 +} + +; Check that the intrinsic remains unchanged in the presence of TFE or LWE +; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe( +; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) +; CHECK: ret float %elt0 +define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 { + %data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0) + %data.vec = extractvalue {<4 x float>,i32} %data, 0 + %elt0 = extractelement <4 x float> %data.vec, i32 0 + ret float %elt0 +} + ; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32( ; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0) ; CHECK-NEXT: ret float %data @@ -1466,6 +1488,7 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_ } declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 +declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1 |