diff options
| author | Changpeng Fang <changpeng.fang@gmail.com> | 2016-11-14 18:33:18 +0000 | 
|---|---|---|
| committer | Changpeng Fang <changpeng.fang@gmail.com> | 2016-11-14 18:33:18 +0000 | 
| commit | 8236fe103f051eac41ff23f6684578b5e31c9871 (patch) | |
| tree | 1fd311f7ad5aec7170aa7a5cbd96bcda1e13d3e4 /llvm/test | |
| parent | 41c52889b90ded563e1aa56cfa7764e083323b00 (diff) | |
| download | bcm5719-llvm-8236fe103f051eac41ff23f6684578b5e31c9871.tar.gz bcm5719-llvm-8236fe103f051eac41ff23f6684578b5e31c9871.zip | |
AMDGPU/SI: Support data types other than V4f32 in image intrinsics
Summary:
  Extend image intrinsics to support data types of V1F32 and V2F32.
  TODO: we should define a mapping table to change the opcode for data type of V2F32 but just one channel is active,
  even though such case should be very rare.
Reviewers:
  tstellarAMD
Differential Revision:
  http://reviews.llvm.org/D26472
llvm-svn: 286860
Diffstat (limited to 'llvm/test')
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll | 19 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll | 43 | ||||
| -rw-r--r-- | llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll | 19 | 
3 files changed, 80 insertions, 1 deletions
| diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll index 854793e22c2..a65f422742c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll @@ -320,6 +320,23 @@ main_body:    ret void  } +; GCN-LABEL: {{^}}gather4_f32: +; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da +define void @gather4_f32(float addrspace(1)* %out) { +main_body: +  %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1) +  store float %r, float addrspace(1)* %out +  ret void +} + +; GCN-LABEL: {{^}}gather4_v2f32: +; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da +define void @gather4_v2f32(<2 x float> addrspace(1)* %out) { +main_body: +  %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1) +  store <2 x float> %r, <2 x float> addrspace(1)* %out +  ret void +}  declare <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0  declare <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 @@ -360,5 +377,7 @@ declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x f  declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0  declare <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0  attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll index 5fe03f09176..faef1913dd1 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -48,6 +48,25 @@ main_body:    ret float %elt  } +;CHECK-LABEL: {{^}}image_load_f32_v2i32: +;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +;CHECK: s_waitcnt vmcnt(0) +define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { +main_body: +  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) +  ret float %tex +} + +;CHECK-LABEL: {{^}}image_load_v2f32_v4i32: +;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +;CHECK: s_waitcnt vmcnt(0) +define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { +main_body: +  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) +  ret <2 x float> %tex +} + +  ;CHECK-LABEL: {{^}}image_store_v4i32:  ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm  define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { @@ -72,6 +91,22 @@ main_body:    ret void  } +;CHECK-LABEL: {{^}}image_store_f32_i32: +;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm +define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) { +main_body: +  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0) +  ret void +} + +;CHECK-LABEL: {{^}}image_store_v2f32_v4i32: +;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm +define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) { +main_body: +  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0) +  ret void +} +  ;CHECK-LABEL: {{^}}image_store_mip:  ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm  define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { @@ -93,7 +128,6 @@ main_body:    ret void  } -  ; Ideally, the register allocator would avoid the wait here  ;  ;CHECK-LABEL: {{^}}image_store_wait: @@ -110,6 +144,13 @@ main_body:    ret void  } + +declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 + +  declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0  declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0  declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll index 7a1ef0bf986..752ec2d42fa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll @@ -181,6 +181,23 @@ main_body:    ret void  } +; GCN-LABEL: {{^}}sample_f32: +; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +define void @sample_f32(float addrspace(1)* %out) { +main_body: +  %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0) +  store float %r, float addrspace(1)* %out +  ret void +} + +; GCN-LABEL: {{^}}sample_v2f32: +; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 +define void @sample_v2f32(<2 x float> addrspace(1)* %out) { +main_body: +  %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0) +  store <2 x float> %r, <2 x float> addrspace(1)* %out +  ret void +}  declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0  declare <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 @@ -204,5 +221,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float>  declare <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0  declare <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0  attributes #0 = { nounwind readnone } | 

