Diffstat (limited to 'llvm/test/CodeGen')
26 files changed, 3213 insertions, 303 deletions
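Overview (editorial note, not part of the commit): the hunks below add gfx901 (GFX9) RUN lines and CHECK patterns for packed 16-bit instructions such as v_pk_add_u16, v_pk_ashrrev_i16, v_pk_mul_f16, and v_pk_fma_f16, alongside the existing per-element SI/CI/VI checks. A minimal sketch of the test shape being added, distilled from the s_test_add_v2i16_kernarg hunk in add.v2i16.ll (the function name here is illustrative; the real tests carry additional RUN lines, check prefixes, and -mattr flags):

; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s

; Sketch mirroring s_test_add_v2i16_kernarg below; name is illustrative.
; On gfx9 a <2 x i16> add selects to a single packed op; one operand
; stays in an SGPR when it comes straight from the kernel arguments.
; GFX9-LABEL: {{^}}sketch_add_v2i16:
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
define void @sketch_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) {
  %add = add <2 x i16> %a, %b
  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
  ret void
}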
diff --git a/llvm/test/CodeGen/AMDGPU/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll new file mode 100644 index 00000000000..7431f141032 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/add.v2i16.ll @@ -0,0 +1,283 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_v2i16: +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_test_add_v2i16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]] +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]] + +; VI: s_add_i32 +; VI: s_add_i32 +define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 + %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1 + %add = add <2 x i16> %a, %b + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_test_add_self_v2i16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]] + +; VI: s_add_i32 +; VI: s_add_i32 +define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 { + %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0 + %add = add <2 x i16> %a, %a + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: VI should not scalarize arg access. 
+; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg: +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} + +; VI: v_add_i32 +; VI: v_add_i32 +define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 { + %add = add <2 x i16> %a, %b + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_v2i16_constant: +; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} + +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}} +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}} +define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 123, i16 456> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant: +; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}} + +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}} +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}} +define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 -845, i16 -991> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1: +; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}} + +; VI: flat_load_ushort [[LOAD0:v[0-9]+]] +; VI: flat_load_ushort [[LOAD1:v[0-9]+]] +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]] +; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]] +; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_e32 +define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 -1, i16 -1> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi: +; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}} +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} + +; VI-NOT: v_add_u16 +; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}} +; VI-NOT: v_add_u16 +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_e32 +define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> 
addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 32, i16 0> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; The high element gives fp +; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split: +; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0 +; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}} + +; VI-NOT: v_add_u16 +; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}} +; VI-NOT: v_add_u16 +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_or_b32_e32 +define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %add = add <2 x i16> %a, <i16 0, i16 16256> + store <2 x i16> %add, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} + +; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_ushort v[[A_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] +; VI: flat_load_ushort v[[B_LO:[0-9]+]] + +; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]] +; VI-NOT: and +; VI-NOT: shl +; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]] +; VI-NOT: and +; VI-NOT: shl +; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}} +define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = zext <2 x i16> %add to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]] +; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GFX9: buffer_store_dwordx4 + +; VI: flat_load_ushort v[[A_LO:[0-9]+]] +; VI: flat_load_ushort v[[A_HI:[0-9]+]] +; VI: flat_load_ushort v[[B_LO:[0-9]+]] +; VI: flat_load_ushort v[[B_HI:[0-9]+]] + +; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; VI: v_add_u16_e32 +; VI: v_add_u16_e32 +; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} + +; VI: buffer_store_dwordx4 +define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = zext <2 x i16> %add to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32: +; GFX9: flat_load_dword [[A:v[0-9]+]] +; GFX9: flat_load_dword [[B:v[0-9]+]] + +; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]] +; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16 +; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]] +; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}} + +; VI: v_add_u16_e32 +; VI: v_add_u16_e32 +; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; VI: buffer_store_dwordx2 +define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = sext <2 x i16> %add to <2 x i32> + store <2 x i32> %ext, <2 x i32> addrspace(1)* %out + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
+; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64: +; GCN: flat_load_dword +; GCN: flat_load_dword + +; GFX9: v_pk_add_u16 +; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} + +; VI: v_add_u16_e32 +; VI: v_add_u16_e32 + +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}} +define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid + %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid + %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1 + %add = add <2 x i16> %a, %b + %ext = sext <2 x i16> %add to <2 x i64> + store <2 x i64> %ext, <2 x i64> addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll index 583a5d12f3c..c0c5305b852 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.ll @@ -26,15 +26,37 @@ entry: define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind { %load = load float, float addrspace(1)* %in, align 4 - %bc = bitcast float %load to <2 x i16> - store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4 + %fadd32 = fadd float %load, 1.0 + %bc = bitcast float %fadd32 to <2 x i16> + %add.bitcast = add <2 x i16> %bc, <i16 2, i16 2> + store <2 x i16> %add.bitcast, <2 x i16> addrspace(1)* %out ret void } define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind { %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 - %bc = bitcast <2 x i16> %load to float - store float %bc, float addrspace(1)* %out, align 4 + %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2> + %bc = bitcast <2 x i16> %add.v2i16 to float + %fadd.bitcast = fadd float %bc, 1.0 + store float %fadd.bitcast, float addrspace(1)* %out + ret void +} + +define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind { + %load = load float, float addrspace(1)* %in, align 4 + %fadd32 = fadd float %load, 1.0 + %bc = bitcast float %fadd32 to <2 x half> + %add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0> + store <2 x half> %add.bitcast, <2 x half> addrspace(1)* %out + ret void +} + +define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind { + %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4 + %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0> + %bc = bitcast <2 x half> %add.v2f16 to float + %fadd.bitcast = fadd float %bc, 1.0 + store float %fadd.bitcast, float addrspace(1)* %out ret void } @@ -58,7 +80,8 @@ define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8 %add = add <2 x i32> %val, <i32 4, i32 9> %bc = bitcast <2 x i32> %add to double - store double %bc, double addrspace(1)* %out, align 8 + %fadd.bc = fadd double %bc, 1.0 + store double %fadd.bc, double addrspace(1)* %out, align 8 ret void } diff 
--git a/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll new file mode 100644 index 00000000000..08321563810 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/ashr.v2i16.ll @@ -0,0 +1,152 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s + +; GCN-LABEL: {{^}}s_ashr_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] + +; CIVI: v_ashrrev_i32_e32 +; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}} +; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CIVI: v_or_b32_e32 +define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { + %result = ashr <2 x i16> %lhs, %rhs + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_ashr_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]] +; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16 +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]] +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = ashr <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_v_s_v2i16: +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 
%tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = ashr <2 x i16> %vgpr, %sgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_s_v_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = ashr <2 x i16> %sgpr, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_imm_v_v2i16: +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4 +define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = ashr <2 x i16> <i16 -4, i16 -4>, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_v_imm_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]] +define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = ashr <2 x i16> %vgpr, <i16 8, i16 8> + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_ashr_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = ashr <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}ashr_v_imm_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %result = ashr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8> + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll index c1dbd424ac3..22e04db7d3b 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; GCN-LABEL: {{^}}extract_vector_elt_v2i16: ; GCN: s_load_dword [[VEC:s[0-9]+]] @@ -70,16 +71,23 @@ define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 } ; GCN-LABEL: {{^}}extract_vector_elt_v4i16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short + +; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c +; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30 +; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]] +; GFX9-DAG: buffer_store_short [[VLOAD0]], off +; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]] +; GFX9-DAG: buffer_store_short [[VLOAD1]], off define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 { %p0 = extractelement <4 x i16> %foo, i32 0 %p1 = extractelement <4 x i16> %foo, i32 2 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10 - store i16 %p1, i16 addrspace(1)* %out, align 2 - store i16 %p0, i16 addrspace(1)* %out1, align 2 + store volatile i16 %p1, i16 addrspace(1)* %out, align 2 + store volatile i16 %p0, i16 addrspace(1)* %out1, align 2 ret void } @@ -88,9 +96,12 @@ define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 ; GCN: buffer_load_ushort ; GCN: buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short + +; GFX9: buffer_store_dword +; GFX9: buffer_store_dword ; GCN: buffer_load_ushort ; GCN: buffer_store_short @@ -102,18 +113,24 @@ define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> } ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16: -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort -; GCN: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: buffer_load_ushort +; SICIVI: 
buffer_load_ushort -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short -; GCN: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short +; SICIVI: buffer_store_short -; GCN: buffer_load_ushort -; GCN: buffer_store_short +; SICIVI: buffer_load_ushort +; SICIVI: buffer_store_short + +; GFX9: s_load_dword +; GFX9: buffer_store_dword +; GFX9: buffer_store_dword +; GFX9: buffer_load_ushort +; GFX9: buffer_store_short define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 { %p0 = extractelement <4 x i16> %foo, i32 %idx %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll index 77c941356b9..401cb833740 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll @@ -1,38 +1,40 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s ; DAGCombiner will transform: ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF)) ; unless isFabsFree returns true -; GCN-LABEL: {{^}}fabs_free_f16: +; GCN-LABEL: {{^}}s_fabs_free_f16: ; GCN: flat_load_ushort [[VAL:v[0-9]+]], ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]] ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) { +define void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc= bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) store half %fabs, half addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}fabs_f16: +; GCN-LABEL: {{^}}s_fabs_f16: ; CI: flat_load_ushort [[VAL:v[0-9]+]], ; CI: v_and_b32_e32 [[CVT0:v[0-9]+]], 0x7fff, [[VAL]] ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define void @fabs_f16(half addrspace(1)* %out, half %in) { +define void @s_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) store half %fabs, half addrspace(1)* %out ret void } ; FIXME: Should be able to use single and -; GCN-LABEL: {{^}}fabs_v2f16: - +; GCN-LABEL: {{^}}s_fabs_v2f16: ; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_or_b32_e32 ; VI: flat_load_ushort [[LO:v[0-9]+]] ; VI: flat_load_ushort [[HI:v[0-9]+]] @@ -43,18 +45,21 @@ define void @fabs_f16(half addrspace(1)* %out, half %in) { ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff, ; VI: v_or_b32 ; VI: flat_store_dword -define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { + +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff +define void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) store <2 x half> %fabs, <2 x half> addrspace(1)* %out ret void } -; GCN-LABEL: {{^}}fabs_v4f16: +; GCN-LABEL: {{^}}s_fabs_v4f16: ; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_and_b32_e32 v{{[0-9]+}}, 
[[MASK]], v{{[0-9]+}} -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} -; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]] ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}} ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} @@ -63,7 +68,7 @@ define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { ; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} ; GCN: flat_store_dwordx2 -define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { +define void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) store <4 x half> %fabs, <4 x half> addrspace(1)* %out ret void @@ -89,6 +94,57 @@ define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) { ret void } -declare half @llvm.fabs.f16(half) readnone -declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone -declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone +; GCN-LABEL: {{^}}v_fabs_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] +; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]] +define void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid + %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2 + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fabs_free_v2f16: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff +define void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { + %bc = bitcast i32 %in to <2 x half> + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc) + store <2 x half> %fabs, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fabs_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] + +; CI: v_cvt_f32_f16_e32 +; CI: v_cvt_f32_f16_e32 +; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} +; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} +; CI: v_cvt_f16_f32 +; CI: v_cvt_f16_f32 + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} +; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}} + +; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]] +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}} +define void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val) + %fmul = fmul <2 x half> %fabs, %val + store <2 x half> %fmul, <2 x half> addrspace(1)* %out + ret void +} + +declare half @llvm.fabs.f16(half) #1 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll index 138e2106e0a..e1edc26d7ed 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -1,4 
+1,5 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s declare half @llvm.fabs.f16(half) #0 declare half @llvm.canonicalize.f16(half) #0 @@ -204,9 +205,12 @@ define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 } ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16: -; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}} -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}} +; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, + +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}} +; GFX9: buffer_store_dword [[REG]] define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) @@ -216,11 +220,14 @@ define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 { ; FIXME: Fold modifier ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16: -; GCN: v_bfe_u32 -; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} -; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} -; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} -; GCN: v_or_b32 +; VI: v_bfe_u32 +; VI: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_or_b32 + +; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}} ; GCN: buffer_store_dword define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out @@ -231,10 +238,13 @@ define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) # } ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16: -; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} -; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} -; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} -; GCN: v_or_b32 +; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}} +; VI: v_or_b32 + +; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}} +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}} ; GCN: buffer_store_dword define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out @@ -247,11 +257,14 @@ define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %o ; FIXME: Fold modifier ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16: -; GCN: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} -; GCN: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]] -; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] -; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]] -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}} +; VI: v_lshrrev_b32_e32 
[[FNEG_HI:v[0-9]+]], 16, [[FNEG]] +; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]] +; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]] +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, + +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}} +; GFX9: buffer_store_dword [[REG]] define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 { %val = load <2 x half>, <2 x half> addrspace(1)* %out %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val @@ -261,9 +274,12 @@ define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) # } ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16: -; GCN: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}} -; GCN: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}} -; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}} +; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, + +; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}} +; GFX9: buffer_store_dword [[REG]] define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 { %val = bitcast i32 %val.arg to <2 x half> %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val) diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll index 27cea2f8245..498b2ea0afb 100644 --- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s declare half @llvm.copysign.f16(half, half) declare float @llvm.copysign.f32(float, float) @@ -8,7 +9,7 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>) declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) -; FUNC-LABEL: {{^}}test_copysign_f16: +; GCN-LABEL: {{^}}test_copysign_f16: ; SI: buffer_load_ushort v[[MAG:[0-9]+]] ; SI: buffer_load_ushort v[[SIGN:[0-9]+]] ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -34,7 +35,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32: +; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32: ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -55,7 +56,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64: +; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64: ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -77,7 +78,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16: +; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16: ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] ; GCN-DAG: buffer_load_ushort 
v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -100,7 +101,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16: +; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16: ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}} ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -123,7 +124,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32: +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32: ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -148,7 +149,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64: +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64: ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]] ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}} ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -173,7 +174,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16: +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16: ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]] ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]] ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2 @@ -200,7 +201,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16: +; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16: ; GCN: v_bfi_b32 ; GCN: s_endpgm define void @test_copysign_out_f16_mag_f64_sign_f16( @@ -216,7 +217,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_v2f16: +; GCN-LABEL: {{^}}test_copysign_v2f16: ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: s_endpgm @@ -230,7 +231,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_v3f16: +; GCN-LABEL: {{^}}test_copysign_v3f16: ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 @@ -245,7 +246,7 @@ entry: ret void } -; FUNC-LABEL: {{^}}test_copysign_v4f16: +; GCN-LABEL: {{^}}test_copysign_v4f16: ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 ; GCN: v_bfi_b32 diff --git a/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll new file mode 100644 index 00000000000..b0123b86b34 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -0,0 +1,107 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s + +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 + +; GCN-LABEL: {{^}}fmuladd_v2f16: +; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} + +; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}} +define void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, + <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { + %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 + %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 + %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3 + %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) + store <2 x half> %r3, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + + %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 + %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 + + %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2) + store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + + %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 + %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 + + %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2) + store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: 
{{^}}fadd_a_a_b_v2f16: +; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]], +; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]], +; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] +; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] + +; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, + <2 x half> addrspace(1)* %in1, + <2 x half> addrspace(1)* %in2) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 + %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid + + %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 + %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 + + %add.0 = fadd <2 x half> %r0, %r0 + %add.1 = fadd <2 x half> %add.0, %r1 + store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll index df1e9a369c8..beeda4cb940 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll @@ -1,16 +1,17 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16: ; CI: v_cvt_f32_f16_e32 ; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}| ; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}} -; VI-NOT: _and -; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| +; GFX89-NOT: _and +; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}| define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs %fadd = fadd half %y, %fsub store half %fadd, half addrspace(1)* %out, align 2 ret void @@ -22,13 +23,13 @@ define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) { ; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}} ; CI: v_cvt_f16_f32_e32 -; VI-NOT: _and -; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| -; VI-NOT: [[MUL]] -; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] +; GFX89-NOT: _and +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}| +; GFX89-NOT: [[MUL]] +; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]] define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { %fabs = call half @llvm.fabs.f16(half %x) - %fsub = fsub half 
-0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs %fmul = fmul half %y, %fsub store half %fmul, half addrspace(1)* %out, align 2 ret void @@ -43,17 +44,16 @@ define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) { define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) { %bc = bitcast i16 %in to half %fabs = call half @llvm.fabs.f16(half %bc) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out ret void } -; FIXME: Should use or ; GCN-LABEL: {{^}}fneg_fabs_f16: ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}} define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { %fabs = call half @llvm.fabs.f16(half %in) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out, align 2 ret void } @@ -63,38 +63,91 @@ define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) { define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %val = load half, half addrspace(1)* %in, align 2 %fabs = call half @llvm.fabs.f16(half %val) - %fsub = fsub half -0.000000e+00, %fabs + %fsub = fsub half -0.0, %fabs store half %fsub, half addrspace(1)* %out, align 2 ret void } ; FIXME: single bit op -; GCN-LABEL: {{^}}fneg_fabs_v2f16: -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; GCN: store_dword -define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { +; GCN-LABEL: {{^}}s_fneg_fabs_v2f16: +; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: flat_store_dword + +; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}} +define void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) { %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) - %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs - store <2 x half> %fsub, <2 x half> addrspace(1)* %out + %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs + store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out ret void } ; GCN-LABEL: {{^}}fneg_fabs_v4f16: -; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} -; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; GCN: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], -; GCN: store_dwordx2 +; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}} +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], +; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], + +; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000 +; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} +; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}} + +; GCN: flat_store_dwordx2 define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) { %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in) - %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs + %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs store <4 x half> %fsub, <4 x half> addrspace(1)* %out ret void } -declare half @llvm.fabs.f16(half) readnone -declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone -declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone +; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16: +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| +; 
CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}| +; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} +; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}} + +; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0 +; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0 + +; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0] +define void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 { + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs + %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0> + store <2 x half> %mul, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16: +; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff +; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]] +; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]] +define void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs + store <2 x half> %fabs, <2 x half> addrspace(1)* %out0 + store <2 x half> %fneg, <2 x half> addrspace(1)* %out1 + ret void +} + +; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16: +; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0] +define void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) { + %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in) + %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs + %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0> + store <2 x half> %fabs, <2 x half> addrspace(1)* %out0 + store <2 x half> %mul, <2 x half> addrspace(1)* %out1 + ret void +} + +declare half @llvm.fabs.f16(half) #1 +declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1 +declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll index d545cc789d8..d0c6d3d5155 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll @@ -1,11 +1,11 @@ -; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s ; FIXME: Should be able to do scalar op -; FUNC-LABEL: {{^}}s_fneg_f16: - -define void @s_fneg_f16(half addrspace(1)* %out, half %in) { - %fneg = fsub half -0.000000e+00, %in +; GCN-LABEL: {{^}}s_fneg_f16: +define void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 { + %fneg = fsub half -0.0, %in store half %fneg, half addrspace(1)* %out ret void } @@ -13,32 +13,35 @@ 
define void @s_fneg_f16(half addrspace(1)* %out, half %in) { ; FIXME: Should be able to use bit operations when illegal type as ; well.
-; FUNC-LABEL: {{^}}v_fneg_f16:
+; GCN-LABEL: {{^}}v_fneg_f16:
; GCN: flat_load_ushort [[VAL:v[0-9]+]],
; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
; SI: buffer_store_short [[XOR]]
-define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
- %val = load half, half addrspace(1)* %in, align 2
- %fneg = fsub half -0.000000e+00, %val
- store half %fneg, half addrspace(1)* %out
+define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i32 %tid
+ %val = load half, half addrspace(1)* %gep.in, align 2
+ %fneg = fsub half -0.0, %val
+ store half %fneg, half addrspace(1)* %gep.out
ret void }
-; FUNC-LABEL: {{^}}fneg_free_f16:
+; GCN-LABEL: {{^}}fneg_free_f16:
; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]],
; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) {
+define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
%bc = bitcast i16 %in to half
%fsub = fsub half -0.0, %bc
store half %fsub, half addrspace(1)* %out
ret void }
-; FUNC-LABEL: {{^}}v_fneg_fold_f16:
+; GCN-LABEL: {{^}}v_fneg_fold_f16:
; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
@@ -49,10 +52,83 @@ define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) {
; VI-NOT: [[NEG_VALUE]]
; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
%val = load half, half addrspace(1)* %in
%fsub = fsub half -0.0, %val
%fmul = fmul half %fsub, %val
store half %fmul, half addrspace(1)* %out
ret void }
+
+; FIXME: Terrible code with VI and even worse with SI/CI
+; GCN-LABEL: {{^}}s_fneg_v2f16:
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32
+
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
+; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+; VI: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+define void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
+ store <2 x half> %fneg, <2 x half> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
+define void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+ %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+ %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ store <2 x
half> %fneg, <2 x half> addrspace(1)* %gep.out + ret void +} + +; GCN-LABEL: {{^}}fneg_free_v2f16: +; GCN: s_load_dword [[VAL:s[0-9]+]] +; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000 + +; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]] +; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]] +define void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 { + %bc = bitcast i32 %in to <2 x half> + %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc + store <2 x half> %fsub, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_fold_v2f16: +; GCN: flat_load_dword [[VAL:v[0-9]+]] + +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}} +; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}} +; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_f16_f32 +; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_cvt_f16_f32 + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, +; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}} +define void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %val = load <2 x half>, <2 x half> addrspace(1)* %in + %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val + %fmul = fmul <2 x half> %fsub, %val + store <2 x half> %fmul, <2 x half> addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 433fdf1e075..1f0cc1511d4 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s ; GCN-LABEL: {{^}}fpext_f16_to_f32 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -8,7 +9,7 @@ ; GCN: s_endpgm define void @fpext_f16_to_f32( float addrspace(1)* %r, - half addrspace(1)* %a) { + half addrspace(1)* %a) #0 { entry: %a.val = load half, half addrspace(1)* %a %r.val = fpext half %a.val to float @@ -24,7 +25,7 @@ entry: ; GCN: s_endpgm define void @fpext_f16_to_f64( double addrspace(1)* %r, - half addrspace(1)* %a) { + half addrspace(1)* %a) #0 { entry: %a.val = load half, half addrspace(1)* %a %r.val = fpext half %a.val to double @@ -34,15 +35,15 @@ entry: ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] +; GFX89-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_lshrrev_b32_e32 
v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] ; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]] ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}} ; GCN: s_endpgm define void @fpext_v2f16_to_v2f32( <2 x float> addrspace(1)* %r, - <2 x half> addrspace(1)* %a) { + <2 x half> addrspace(1)* %a) #0 { entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = fpext <2 x half> %a.val to <2 x float> @@ -51,13 +52,14 @@ entry: } ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64 -; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] -; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]] -; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]] -; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}} +; GCN: buffer_load_dword +; GCN-DAG: v_lshrrev_b32_e32 +; GCN-DAG: v_cvt_f32_f16_e32 +; GCN: v_cvt_f32_f16_e32 + +; GCN: v_cvt_f64_f32_e32 +; GCN: v_cvt_f64_f32_e32 +; GCN: buffer_store_dwordx4 ; GCN: s_endpgm define void @fpext_v2f16_to_v2f64( <2 x double> addrspace(1)* %r, @@ -129,7 +131,7 @@ entry: ; FIXME: Using the source modifier here only wastes code size ; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]] -; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] +; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] ; GCN: store_dword [[CVT]] ; GCN: store_short [[XOR]] @@ -152,8 +154,8 @@ entry: ; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]] ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] -; VI-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]] -; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]] +; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]] +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]] ; GCN: buffer_store_dword [[CVTA_NEG]] ; GCN: buffer_store_short [[MUL]] @@ -198,8 +200,8 @@ entry: ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] ; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]] -; VI-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]| -; VI: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]] +; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]| +; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]] ; GCN: buffer_store_dword [[ABS_A]] ; GCN: buffer_store_short [[MUL]] @@ -245,8 +247,8 @@ entry: ; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]] ; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]] -; VI-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]| -; VI-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]] +; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]| +; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]] ; GCN: buffer_store_dword [[FABS_FNEG]] ; GCN: buffer_store_short [[MUL]] diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll index c9905d5f7ff..b1879777b44 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll @@ -1,5 +1,7 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s +; RUN: llc -march=amdgcn 
-mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s ; GCN-LABEL: {{^}}fptrunc_f32_to_f16: ; GCN: buffer_load_dword v[[A_F32:[0-9]+]] @@ -36,9 +38,15 @@ entry: ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}} ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] +; SIVI-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] + +; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define void @fptrunc_v2f32_to_v2f16( @@ -57,9 +65,16 @@ entry: ; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}} ; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]] ; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]] + +; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]] + ; GCN: buffer_store_dword v[[R_V2_F16]] define void @fptrunc_v2f64_to_v2f16( <2 x half> addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index 15655695239..e62dc01592f 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s ; GCN-LABEL: 
{{^}}fsub_f16: ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] @@ -8,7 +9,7 @@ ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] +; GFX89: v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fsub_f16( @@ -28,7 +29,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] +; GFX89: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fsub_f16_imm_a( @@ -46,7 +47,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] +; GFX89: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fsub_f16_imm_b( @@ -62,8 +63,9 @@ entry: ; GCN-LABEL: {{^}}fsub_v2f16: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SIVI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SIVI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] @@ -72,11 +74,16 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] -; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] +; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]] + +; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define void @fsub_v2f16( @@ -94,17 +101,24 @@ entry: ; GCN-LABEL: {{^}}fsub_v2f16_imm_a: ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] +; SIVI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]] ; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] + ; VI: v_sub_f16_e32 
v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] ; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00 +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1] + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define void @fsub_v2f16_imm_a( @@ -120,7 +134,8 @@ entry: ; GCN-LABEL: {{^}}fsub_v2f16_imm_b: ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] -; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] +; SIVI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]] + ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]] ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] @@ -128,9 +143,14 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] ; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] -; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] -; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] -; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; SIVI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] +; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] +; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] + +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000 +; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}} + ; GCN: buffer_store_dword v[[R_V2_F16]] ; GCN: s_endpgm define void @fsub_v2f16_imm_b( diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll new file mode 100644 index 00000000000..c1ed3952cba --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/immv216.ll @@ -0,0 +1,447 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; FIXME: Merge into imm.ll + +; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 { + store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x 
half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}} +; GCN: buffer_store_dword [[REG]] +define void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_literal_imm_v2f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00 +; GCN: buffer_store_dword [[REG]] +define void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 { + store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 
[[REG:v[0-9]+]], [[VAL]], 0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0.0, half 0.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0.5, half 0.5> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half -0.5, half -0.5> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 1.0, half 1.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half -1.0, half -1.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 2.0, half 2.0> + 
store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half -2.0, half -2.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 4.0, half 4.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half -4.0, half -4.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16: +; GFX9: buffer_load_dword [[VAL:v[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5 +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_dword +; VI-NOT: and +; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}} +; VI: v_or_b32 +; VI: buffer_store_dword +define void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %x = load <2 x half>, <2 x half> addrspace(1)* %in + %y = fadd <2 x half> %x, <half 0.5, half 0.5> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}commute_add_literal_v2f16: +; GFX9: buffer_load_dword [[VAL:v[0-9]+]] +; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400 +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] +; GFX9: buffer_store_dword [[REG]] + +; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}} +; VI-DAG: buffer_load_dword +; VI-NOT: and +; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}} +; VI: v_or_b32 +; VI: buffer_store_dword +define void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { + %x = load <2 x half>, <2 x half> addrspace(1)* %in + %y = fadd <2 x half> %x, <half 1024.0, half 1024.0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_1_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 
[[REG:v[0-9]+]], [[VAL]], 1{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_2_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_16_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xHFFFF, half 0xHFFFF> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xHFFFE, half 0xHFFFE> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -16{{$}} +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xHFFF0, half 
0xHFFF0> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_63_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63 +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_64_v2f16: +; GFX9: s_load_dword [[VAL:s[0-9]+]] +; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64 +; GFX9: buffer_store_dword [[REG]] + +; VI: buffer_load_ushort [[VAL0:v[0-9]+]] +; VI: buffer_load_ushort [[VAL1:v[0-9]+]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]] +; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]] +; VI: v_or_b32 +; VI: buffer_store_dword +define void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 { + %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040> + store <2 x half> %y, <2 x half> addrspace(1)* %out + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 2cd4d0c6be9..079a441f81c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,4 +1,5 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: @@ -6,6 +7,9 @@ ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}} + +; GFX9-NOT: lshr +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]] define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 0 @@ -20,6 +24,10 @@ define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> add ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] + +; GFX9-NOT: [[ELT0]] +; GFX9-NOT: [[VEC]] +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]] define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 @@ -27,12 +35,39 @@ define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> ret void } -; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi: +; GCN-LABEL: 
{{^}}s_insertelement_v2i16_0_multi_use_hi_reg: ; GCN: s_load_dword [[ELT0:s[0-9]+]] ; GCN: s_load_dword [[VEC:s[0-9]+]] +; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}} +; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16 +; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CIVI-DAG: ; use [[SHR]] + +; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 +; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; GFX9-DAG: ; use [[ELT1]] +define void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt1 = extractelement <2 x i16> %vec, i32 1 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + %use1 = zext i16 %elt1 to i32 + call void asm sideeffect "; use $0", "s"(i32 %use1) #0 + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi: +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[VEC:s[0-9]+]] + ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} -; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]] + +; GFX9-NOT: [[ELT0]] +; GFX9-NOT: [[VEC]] +; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]] define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %elt.hi = lshr i32 %elt.arg, 16 @@ -42,12 +77,65 @@ define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i1 ret void } +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1: +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[VEC:s[0-9]+]], + +; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}} +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] + +; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16 +; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]] +; GFX9: ; use [[ELT1]] +define void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt.hi = lshr i32 %elt.arg, 16 + %elt = trunc i32 %elt.hi to i16 + %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + %use1 = zext i16 %elt to i32 + call void asm sideeffect "; use $0", "s"(i32 %use1) #0 + ret void +} + +; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1: +; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1] +; GCN: s_load_dword [[VEC:s[0-9]+]], + +; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16 +; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16 +; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] + +; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16 +; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16 +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]] +; GFX9: ; use [[ELT_HI]] +; GFX9: ; use [[VEC_HI]] +define void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 { + %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr + %elt.hi = lshr i32 %elt.arg, 16 + %elt = trunc i32 %elt.hi to i16 + %vec.hi = extractelement <2 x i16> %vec, i32 1 + %vecins = insertelement <2 x i16> %vec, 
i16 %elt, i32 0 + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out + %use1 = zext i16 %elt to i32 + %vec.hi.use1 = zext i16 %vec.hi to i32 + + call void asm sideeffect "; use $0", "s"(i32 %use1) #0 + call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0 + ret void +} + ; GCN-LABEL: {{^}}s_insertelement_v2i16_1: ; GCN: s_load_dword [[VEC:s[0-9]+]] ; GCN-NOT: s_lshr -; GCN: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} -; GCN: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000 + +; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} +; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000 + +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7 define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 999, i32 1 @@ -63,6 +151,7 @@ define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> add ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]] ; GCN-NOT: shlr +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]] define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 { %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1 @@ -74,6 +163,9 @@ define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> ; GCN: s_load_dword [[VEC:s[0-9]+]] ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC:s[0-9]+]], 0xffff0000 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x4500 + +; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16 +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]] define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0 @@ -82,10 +174,13 @@ define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> a } ; GCN-LABEL: {{^}}s_insertelement_v2f16_1: -; GCN: s_load_dword [[VEC:s[0-9]+]] +; GFX9: s_load_dword [[VEC:s[0-9]+]] ; GCN-NOT: s_lshr -; GCN: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} -; GCN: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000 + +; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}} +; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000 + +; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500 define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 { %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1 @@ -97,6 +192,10 @@ define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> a ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]] + +; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}} +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -117,6 +216,10 @@ define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> add ; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]] +; 
GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]] +; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]] + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -137,6 +240,9 @@ define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i1 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]] +; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}} +; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]] + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -155,6 +261,10 @@ define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3e7 +; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]] + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -171,7 +281,7 @@ define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> add ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]] - +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -190,6 +300,10 @@ define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]] +; GFX9: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}} +; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]] + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -208,6 +322,8 @@ define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> a ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]] +; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -224,6 +340,10 @@ define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]] +; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x4500 +; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, 
[[ELT0]] + ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -240,7 +360,7 @@ define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> a ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]] ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]] - +; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]] ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]] define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -295,9 +415,9 @@ define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7 -; VI-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; VI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] -; VI: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] +; GFX89: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] ; CI: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; CI: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] @@ -322,9 +442,9 @@ define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 ; GCN: flat_load_dword [[VEC:v[0-9]+]] ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234 -; VI-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; VI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] -; VI: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] +; GFX89: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] ; CI: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]] ; CI: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll index 37ad164b773..a8f6579acbf 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0 declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1) #0 diff --git a/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll new file mode 100644 index 00000000000..ee07b0e9766 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -0,0 +1,150 @@ +; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s + +; GCN-LABEL: 
{{^}}s_lshr_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]] + +; CIVI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { + %result = lshr <2 x i16> %lhs, %rhs + store <2 x i16> %result, <2 x i16> addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_lshr_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] + +; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}} +; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]] +; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[RHS]] +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} +; CI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16 +; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}} +; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr + %result = lshr <2 x i16> %a, %b + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_v_s_v2i16: +; GFX9: s_load_dword [[RHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> %vgpr, %sgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_s_v_v2i16: +; GFX9: s_load_dword [[LHS:s[0-9]+]] +; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]] +define void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> 
addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> %sgpr, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_imm_v_v2i16: +; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8 +define void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_v_imm_v2i16: +; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]] +; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]] +define void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %result = lshr <2 x i16> %vgpr, <i16 8, i16 8> + store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_lshr_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 + %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr + %result = lshr <4 x i16> %a, %b + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}lshr_v_imm_v4i16: +; GCN: {{buffer|flat}}_load_dwordx2 +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}} +; GCN: {{buffer|flat}}_store_dwordx2 +define void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext + %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext + %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep + %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8> + store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/max.i16.ll b/llvm/test/CodeGen/AMDGPU/max.i16.ll index 3f2a87f2069..d442c8754b7 100644 --- a/llvm/test/CodeGen/AMDGPU/max.i16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -1,11 +1,9 @@ -; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s - - -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sge_i16: -; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -20,11 +18,55 @@ define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr } ; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_imax_sge_v2i16: +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). +; GCN-LABEL: {{^}}v_test_imax_sge_v3i16: +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; VI-NOT: v_max_i16 + +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid + %a = load <3 x i16>, <3 x i16> addrspace(1)* %gep0, align 4 + %b = load <3 x i16>, <3 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp sge <3 x i16> %a, %b + %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %val, <3 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +; FIXME: Need to handle non-uniform case for function below (load without gep). 
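(The gep-free form this FIXME refers to would load directly through the argument pointers, so the address is not derived from the workitem id. A minimal sketch of that shape, with a hypothetical function name and the expected codegen deliberately not asserted:

define void @v_test_imax_sge_i16_no_gep(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
  ; Direct loads: no getelementptr on the workitem id.
  %a = load i16, i16 addrspace(1)* %aptr
  %b = load i16, i16 addrspace(1)* %bptr
  %cmp = icmp sge i16 %a, %b
  %val = select i1 %cmp, i16 %a, i16 %b
  store i16 %val, i16 addrspace(1)* %out
  ret void
}

The tid-indexed geps in the tests below sidestep that case for now.)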
; GCN-LABEL: {{^}}v_test_imax_sge_v4i16: ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} + +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid @@ -40,7 +82,7 @@ define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrs ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_imax_sgt_i16: -; VI: v_max_i16_e32 +; VIPLUS: v_max_i16_e32 define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -56,7 +98,7 @@ define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; FIXME: Need to handle non-uniform case for function below (load without gep). ; GCN-LABEL: {{^}}v_test_umax_uge_i16: -; VI: v_max_u16_e32 +; VIPLUS: v_max_u16_e32 define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -72,7 +114,7 @@ define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr ; FIXME: Need to handle non-uniform case for function below (load without gep). 
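(As throughout this patch, the checks above lean on two FileCheck conventions worth keeping in mind; the lines below are an illustrative sketch, not part of the patch. First, a directive fires only when its prefix is named on that RUN line, so a shared prefix factors out checks common to several targets:

; RUN: llc -march=amdgcn -mcpu=fiji ... | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s
; RUN: llc -march=amdgcn -mcpu=gfx901 ... | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s

Here GCN and VIPLUS lines are checked for both targets, while plain VI and GFX9 lines each bind to a single RUN line. Second, [[NAME:regex]] captures a register and later bare [[NAME]] uses must match that same register, as in:

; GFX9: v_pk_max_i16 [[DST:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[DST]]
)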
; GCN-LABEL: {{^}}v_test_umax_ugt_i16: -; VI: v_max_u16_e32 +; VIPLUS: v_max_u16_e32 define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -85,3 +127,23 @@ define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr store i16 %val, i16 addrspace(1)* %outgep, align 4 ret void } + +; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16: +; VI: v_max_u16_e32 +; VI: v_max_u16_e32 + +; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} +define void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind { + %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone + %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid + %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid + %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4 + %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4 + %cmp = icmp ugt <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll index 19d0117d64a..c7a89e09588 100644 --- a/llvm/test/CodeGen/AMDGPU/min.ll +++ b/llvm/test/CodeGen/AMDGPU/min.ll @@ -1,17 +1,22 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 ; EG: MIN_INT -define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp sle i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 ret void } @@ -19,7 +24,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define void 
@s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp sle i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -30,7 +35,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp sle <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -47,7 +52,7 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind { +define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 { %cmp = icmp sle <4 x i32> %a, %b %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b store <4 x i32> %val, <4 x i32> addrspace(1)* %out @@ -60,7 +65,7 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, < ; GCN: s_sext_i32_i8 ; GCN: s_sext_i32_i8 ; GCN: s_min_i32 -define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { +define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 { %cmp = icmp sle i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b store i8 %val, i8 addrspace(1)* %out @@ -90,30 +95,62 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind { ; VI: v_min_i32 ; VI: v_min_i32 +; GFX9: v_min_i16 +; GFX9: v_min_i16 +; GFX9: v_min_i16 +; GFX9: v_min_i16 + ; GCN: s_endpgm ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind { +define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 { %cmp = icmp sle <4 x i8> %a, %b %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b store <4 x i8> %val, <4 x i8> addrspace(1)* %out ret void } +; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16: +; SI: v_min_i32 +; SI: v_min_i32 + +; VI: v_min_i32 +; VI: v_min_i32 + +; GFX9: v_pk_min_i16 + +; EG: MIN_INT +; EG: MIN_INT +define void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 { + %cmp = icmp sle <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out + ret void +} + +; FIXME: VI use s_min_i32 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16: ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 ; SI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 +; VI: v_min_i32 + +; GFX9: v_pk_min_i16 +; GFX9: v_pk_min_i16 + ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind { +define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 { %cmp = icmp sle <4 x i16> %a, %b %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b store <4 x i16> %val, <4 x i16> addrspace(1)* %out @@ -124,12 +161,36 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, < ; GCN: v_min_i32_e32 ; EG: MIN_INT -define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, 
align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp slt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 + ret void +} + +; FUNC-LABEL: @v_test_imin_slt_i16 +; SI: v_min_i32_e32 + +; GFX89: v_min_i16_e32 + +; EG: MIN_INT +define void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid + %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid + %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid + + %a = load i16, i16 addrspace(1)* %a.gep + %b = load i16, i16 addrspace(1)* %b.gep + %cmp = icmp slt i16 %a, %b + %val = select i1 %cmp, i16 %a, i16 %b + store i16 %val, i16 addrspace(1)* %out.gep ret void } @@ -137,7 +198,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp slt i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -150,7 +211,7 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; EG: MIN_INT ; EG: MIN_INT -define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind { +define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 { %cmp = icmp slt <2 x i32> %a, %b %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b store <2 x i32> %val, <2 x i32> addrspace(1)* %out @@ -161,7 +222,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, < ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp slt i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -172,7 +233,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}} -define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { +define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 { %cmp = icmp sle i32 %a, 8 %val = select i1 %cmp, i32 %a, i32 8 store i32 %val, i32 addrspace(1)* %out, align 4 @@ -183,12 +244,16 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind { ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 
+define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp ule i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 ret void } @@ -196,25 +261,65 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr ; GCN: v_min_u32_e32 ; GCN: v_min_u32_e32 ; GCN: v_min_u32_e32 -; SI-NOT: v_min_u32_e32 +; GCN-NOT: v_min_u32_e32 ; GCN: s_endpgm ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind { - %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr - %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr +define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid + + %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep + %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep %cmp = icmp ule <3 x i32> %a, %b %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b - store <3 x i32> %val, <3 x i32> addrspace(1)* %out + store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep + ret void +} + +; FIXME: Reduce unused packed component to scalar +; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}} +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI: v_min_u32_e32 +; SI-NOT: v_min_u32_e32 + +; VI: v_min_u16_e32 +; VI: v_min_u16_e32 +; VI: v_min_u16_e32 +; VI-NOT: v_min_u16_e32 + +; GFX9: v_pk_min_u16 +; GFX9: v_pk_min_u16 + +; GCN: s_endpgm + +; EG: MIN_UINT +; EG: MIN_UINT +; EG: MIN_UINT +define void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid + + %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep + %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep + %cmp = icmp ule <3 x i16> %a, %b + %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b + store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep ret void } + ; FUNC-LABEL: @s_test_umin_ule_i32 ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ule i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -225,27 +330,40 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; GCN: v_min_u32_e32 ; EG: MIN_UINT -define void 
@v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { - %a = load i32, i32 addrspace(1)* %aptr, align 4 - %b = load i32, i32 addrspace(1)* %bptr, align 4 +define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid + %a = load i32, i32 addrspace(1)* %a.gep, align 4 + %b = load i32, i32 addrspace(1)* %b.gep, align 4 %cmp = icmp ult i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b - store i32 %val, i32 addrspace(1)* %out, align 4 + store i32 %val, i32 addrspace(1)* %out.gep, align 4 ret void } ; FUNC-LABEL: {{^}}v_test_umin_ult_i8: -; GCN: buffer_load_ubyte -; GCN: buffer_load_ubyte -; GCN: v_min_u32_e32 +; SI: buffer_load_ubyte +; SI: buffer_load_ubyte +; SI: v_min_u32_e32 + +; GFX89: flat_load_ubyte +; GFX89: flat_load_ubyte +; GFX89: v_min_u16_e32 ; EG: MIN_UINT -define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind { - %a = load i8, i8 addrspace(1)* %aptr, align 1 - %b = load i8, i8 addrspace(1)* %bptr, align 1 +define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid + + %a = load i8, i8 addrspace(1)* %a.gep, align 1 + %b = load i8, i8 addrspace(1)* %b.gep, align 1 %cmp = icmp ult i8 %a, %b %val = select i1 %cmp, i8 %a, i8 %b - store i8 %val, i8 addrspace(1)* %out, align 1 + store i8 %val, i8 addrspace(1)* %out.gep, align 1 ret void } @@ -253,7 +371,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind { +define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %cmp = icmp ult i32 %a, %b %val = select i1 %cmp, i32 %a, i32 %b store i32 %val, i32 addrspace(1)* %out, align 4 @@ -268,7 +386,7 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind { +define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 { %a = load i32, i32 addrspace(1)* %aptr, align 4 %b = load i32, i32 addrspace(1)* %bptr, align 4 %cmp = icmp ult i32 %a, %b @@ -286,7 +404,7 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace ; GCN: s_endpgm ; EG-NOT: MIN_UINT -define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind { +define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 { %a = load i16, i16 addrspace(1)* %aptr, align 2 %b = load i16, i16 addrspace(1)* %bptr, align 2 %cmp = icmp ult i16 %a, %b @@ -301,7 +419,7 @@ 
define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace ; GCN: s_min_u32 ; EG: MIN_UINT -define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind { +define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 { %cmp = icmp ult <1 x i32> %a, %b %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b store <1 x i32> %val, <1 x i32> addrspace(1)* %out @@ -326,7 +444,7 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, < ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind { +define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 { %cmp = icmp ult <8 x i32> %a, %b %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b store <8 x i32> %val, <8 x i32> addrspace(1)* %out @@ -334,14 +452,23 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < } ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16: -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 -; GCN: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 +; SI: v_min_u32 + +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 +; VI: v_min_u32 ; EG: MIN_UINT ; EG: MIN_UINT @@ -351,7 +478,7 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, < ; EG: MIN_UINT ; EG: MIN_UINT ; EG: MIN_UINT -define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind { +define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 { %cmp = icmp ult <8 x i16> %a, %b %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b store <8 x i16> %val, <8 x i16> addrspace(1)* %out @@ -367,7 +494,7 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, < ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_UINT -define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind { +define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 { %a.ext = zext i16 %a to i32 %b.ext = zext i16 %b to i32 %cmp = icmp ult i32 %a.ext, %b.ext @@ -387,7 +514,7 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1 ; GCN: buffer_store_dword [[VMIN]] ; EG: MIN_INT -define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind { +define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 { %a.ext = sext i16 %a to i32 %b.ext = sext i16 %b to i32 %cmp = icmp slt i32 %a.ext, %b.ext @@ -402,7 +529,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 ; GCN: s_min_i32 ; EG: MIN_INT -define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind { +define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 { %cmp = icmp sle i16 %a, %b %val = select i1 %cmp, i16 %a, i16 %b store i16 %val, i16 addrspace(1)* %out @@ -415,7 +542,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin ; EG: MIN_UINT ; EG: MIN_UINT -define void 
@test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ult i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -427,7 +554,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG: MIN_UINT ; EG: MIN_UINT -define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp ule i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -439,7 +566,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp slt i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 @@ -451,9 +578,63 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind ; EG-DAG: MIN_UINT ; EG-DAG: MIN_INT -define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind { +define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { %tmp = icmp sle i64 %a, %b %val = select i1 %tmp, i64 %a, i64 %b store i64 %val, i64 addrspace(1)* %out, align 8 ret void } + +; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16: +; SI: v_min_i32 +; SI: v_min_i32 + +; VI: v_min_i16 +; VI: v_min_i16 + +; GFX9: v_pk_min_i16 + +; EG: MIN_INT +; EG: MIN_INT +define void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep + %cmp = icmp sle <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep + ret void +} + +; FIXME: i16 min +; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16: +; SI: v_min_u32 +; SI: v_min_u32 + +; VI: v_min_u16 +; VI: v_min_u16 + +; GFX9: v_pk_min_u16 + +; EG: MIN_UINT +; EG: MIN_UINT +define void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() + %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid + %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid + %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep + %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep + %cmp = icmp ule <2 x i16> %a, %b + %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b + store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll new file mode 100644 index 00000000000..012d19d3751 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll @@ -0,0 +1,229 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + + +; GCN-LABEL: {{^}}s_pack_v2f16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] +; GFX9: ; use [[PACKED]] +define void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { + %val0 = load volatile i32, i32 addrspace(2)* %in0 + %val1 = load volatile i32, i32 addrspace(2)* %in1 + %lo.i = trunc i32 %val0 to i16 + %hi.i = trunc i32 %val1 to i16 + %lo = bitcast i16 %lo.i to half + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2f16_imm_lo: +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]] +; GFX9: ; use [[PACKED]] +define void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 { + %val1 = load i32, i32 addrspace(2)* %in1 + %hi.i = trunc i32 %val1 to i16 + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2f16_imm_hi: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234 +; GFX9: ; use [[PACKED]] +define void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 { + %val0 = load i32, i32 addrspace(2)* %in0 + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]] + +; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] +; GFX9: ; use [[PACKED]] +define void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo.i = trunc i32 %val0 to i16 + %hi.i = trunc i32 %val1 to i16 + %lo = 
bitcast i16 %lo.i to half + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_user: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]] + +; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]] + +; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] +define void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo.i = trunc i32 %val0 to i16 + %hi.i = trunc i32 %val1 to i16 + %lo = bitcast i16 %lo.i to half + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + %foo = add i32 %vec.i32, 9 + store volatile i32 %foo, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_imm_lo: +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: s_movk_i32 [[K:s[0-9]+]], 0x1234{{$}} +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[K]], [[VAL1]] + +; GFX9-FLUSH: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}} +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] +; GFX9: ; use [[PACKED]] +define void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi.i = trunc i32 %val1 to i16 + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half 0xH1234, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_lo: +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], 4.0, [[VAL1]] + +; GFX9-FLUSH: v_mov_b32_e32 [[K:v[0-9]+]], 0x4400{{$}} +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] + +; GFX9: ; use [[PACKED]] +define void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi.i = trunc i32 %val1 to i16 + %hi = bitcast i16 %hi.i to half + %vec.0 = insertelement <2 x half> undef, half 4.0, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_imm_hi: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9-DENORM: s_movk_i32 [[K:s[0-9]+]], 0x1234 +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], 
[[VAL0]], [[K]] + +; GFX9-FLUSH: s_movk_i32 [[K:s[0-9]+]], 0x1234 +; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] + +; GFX9: ; use [[PACKED]] +define void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 0xH1234, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_inline_f16imm_hi: +; GFX9: flat_load_dword [[VAL:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 1.0 + +; GFX9-FLUSH: s_movk_i32 [[K:s[0-9]+]], 0x3c00 +; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]] + +; GFX9: ; use [[PACKED]] +define void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 1.0, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_hi: +; GFX9: flat_load_dword [[VAL:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 64 + +; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]] + +; GFX9: ; use [[PACKED]] +define void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo.i = trunc i32 %val0 to i16 + %lo = bitcast i16 %lo.i to half + %vec.0 = insertelement <2 x half> undef, half %lo, i32 0 + %vec.1 = insertelement <2 x half> %vec.0, half 0xH0040, i32 1 + %vec.i32 = bitcast <2 x half> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll new file mode 100644 index 00000000000..fafab687a48 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck 
-check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s + + +; GCN-LABEL: {{^}}s_pack_v2i16: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]] +; GFX9: ; use [[PACKED]] +define void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 { + %val0 = load volatile i32, i32 addrspace(2)* %in0 + %val1 = load volatile i32, i32 addrspace(2)* %in1 + %lo = trunc i32 %val0 to i16 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2i16_imm_lo: +; GFX9: s_load_dword [[VAL1:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]] +; GFX9: ; use [[PACKED]] +define void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 { + %val1 = load i32, i32 addrspace(2)* %in1 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 456, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}s_pack_v2i16_imm_hi: +; GFX9: s_load_dword [[VAL0:s[0-9]+]] +; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8 +; GFX9: ; use [[PACKED]] +define void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 { + %val0 = load i32, i32 addrspace(2)* %in0 + %lo = trunc i32 %val0 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 456, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + + call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]] + +; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] +; GFX9: ; use [[PACKED]] +define void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo = trunc i32 %val0 to i16 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_user: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]] + +; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]] +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]] + +; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]] +define void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 { + %tid = call i32 
@llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %lo = trunc i32 %val0 to i16 + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + %foo = add i32 %vec.i32, 9 + store volatile i32 %foo, i32 addrspace(1)* undef + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_imm_lo: +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}} +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[K]], [[VAL1]] + +; GFX9-FLUSH: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b{{$}} +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]] + +; GFX9: ; use [[PACKED]] +define void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 123, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_lo: +; GFX9: flat_load_dword [[VAL1:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], 64, [[VAL1]] + +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64 +; GFX9: ; use [[PACKED]] +define void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext + %val1 = load volatile i32, i32 addrspace(1)* %in1.gep + %hi = trunc i32 %val1 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 64, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 %hi, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_imm_hi: +; GFX9: flat_load_dword [[VAL0:v[0-9]+]] +; GFX9-DENORM: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}} +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[K]] + +; GFX9-FLUSH: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}} +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]] + +; GFX9: ; use [[PACKED]] +define void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo = trunc i32 %val0 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_hi: +; GFX9: flat_load_dword [[VAL:v[0-9]+]] +; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 7 +; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL]] +; GFX9: ; use [[PACKED]] +define void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + 
%tid.ext = sext i32 %tid to i64 + %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext + %val0 = load volatile i32, i32 addrspace(1)* %in0.gep + %lo = trunc i32 %val0 to i16 + %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0 + %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1 + %vec.i32 = bitcast <2 x i16> %vec.1 to i32 + call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll index f8f21af5bda..35886f85618 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll @@ -67,10 +67,9 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac ; ret void ; } -; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind { -; %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0 -; %bc = bitcast <2 x i32> %newvec0 to <4 x i16> -; %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4> -; store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16 -; ret void -; } +define void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind { + %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0 + %bc = bitcast <4 x i8> %newvec0 to <2 x half> + store <2 x half> %bc, <2 x half> addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll index 4c58261709c..015448cfdb3 100644 --- a/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ b/llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -1,8 +1,10 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FIXME: i16 promotion pass ruins the scalar cases when legal. 
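(The sext-in-reg idiom this file exercises is a shl/ashr pair by the same amount, which sign-extends the low N bits of a value in place. A restatement of the pattern for the low bit of an i32, not an added test:

define i32 @sext_in_reg_i1_example(i32 %in) {
  ; Move bit 0 up to the sign bit, then arithmetic-shift it back,
  ; replicating it across the upper 31 bits.
  %shl = shl i32 %in, 31
  %sext = ashr i32 %shl, 31
  ret i32 %sext
}
)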
+; FIXME: r600 fails verifier ; FUNC-LABEL: {{^}}sext_in_reg_i1_i32: ; GCN: s_load_dword [[ARG:s[0-9]+]], @@ -150,14 +152,14 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 { ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -177,14 +179,14 @@ define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -204,14 +206,14 @@ define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* % ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -231,11 +233,11 @@ define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, 
i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -606,15 +608,15 @@ define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1 ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}} ; GCN-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -636,15 +638,15 @@ define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrsp ; SI: buffer_load_dwordx2 ; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -; VI: flat_load_dwordx2 -; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, +; GFX89: flat_load_dwordx2 +; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, ; GCN-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.r600.read.tidig.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -668,9 +670,9 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15 define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr %in = trunc i32 %ld to i16 @@ -687,9 +689,9 @@ define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]] ; SI: buffer_store_short [[VBFE]] -; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 -; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} -; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 +; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 +; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}} +; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14 define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 { %ld = load i32, i32 addrspace(2)* %ptr 
@@ -721,7 +723,7 @@ define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr
 ; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]]
 ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
-; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
 
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
 ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
@@ -748,9 +750,9 @@ define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(
 ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
 define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
  %shl = shl i16 %in, 14
  %sext = ashr i16 %shl, 14
@@ -765,9 +767,9 @@ define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
 ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
 define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
  %shl = shl i16 %in, 8
  %sext = ashr i16 %shl, 8
@@ -782,9 +784,9 @@ define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
 ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
 define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
  %shl = shl i16 %in, 1
  %sext = ashr i16 %shl, 1
@@ -792,6 +794,72 @@ define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
  ret void
 }
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]]
+define void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i16> %c, <i16 15, i16 15>
+ %ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
+ store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
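; Note (illustrative aside, not a patch line): a shl-by-k / ashr-by-k pair on
; i16 is the canonical sign_extend_inreg pattern for a (16-k)-bit field,
; which is why each test checks the same immediate on both packed shifts.
; Per-lane worked example for the 1-bit case (k = 15):
;   0x0001 shl 15 -> 0x8000;  0x8000 ashr 15 -> 0xffff (-1)
; On gfx9 the whole <2 x i16> stays packed in one VGPR, so a single
; v_pk_lshlrev_b16 / v_pk_ashrrev_i16 pair handles both lanes.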
+; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16:
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+define void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+ %c = add <3 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15>
+ %ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15>
+ store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i2_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]]
+define void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i16> %c, <i16 14, i16 14>
+ %ashr = ashr <2 x i16> %shl, <i16 14, i16 14>
+ store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]]
+define void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+ %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <2 x i16> %c, <i16 8, i16 8>
+ %ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
+ store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16:
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+define void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+ %c = add <3 x i16> %a, %b ; add to prevent folding into extload
+ %shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8>
+ %ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8>
+ store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out
+ ret void
+}
+
 declare i32 @llvm.r600.read.tidig.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
new file mode 100644
index 00000000000..bf4a6238eed
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -0,0 +1,150 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
+; GCN-LABEL: {{^}}s_shl_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; CIVI: v_lshlrev_b32_e32
+; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CIVI: v_or_b32_e32
+define void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+ %result = shl <2 x i16> %lhs, %rhs
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
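; Note (illustrative aside, not a patch line): the lshlrev-style opcodes take
; the shift amount as the first source operand, which is why the packed check
; above reads v_pk_lshlrev_b16 [[RESULT]], [[RHS]], [[VLHS]] (amount, then
; value). CI/VI have no packed 16-bit shift, so the CIVI lines instead check
; the split form: shift each half separately, then recombine with
; v_lshlrev_b32 16 / v_or_b32.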
+; GCN-LABEL: {{^}}v_shl_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+ %result = shl <2 x i16> %a, %b
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> %vgpr, %sgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> %sgpr, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}shl_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
+define void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> <i16 8, i16 8>, %vgpr
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
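; Note (illustrative aside, not a patch line): the two immediate tests around
; this point are mirror images. In shl_imm_v_v2i16 above, the constant
; <i16 8, i16 8> is the value being shifted, so it lands in the trailing
; (value) operand:
;   v_pk_lshlrev_b16 [[RESULT]], [[RHS]], 8
; In shl_v_imm_v2i16 below, 8 is the shift amount and moves to the leading
; source position:
;   v_pk_lshlrev_b16 [[RESULT]], 8, [[LHS]]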
+; GCN-LABEL: {{^}}shl_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+ %result = shl <2 x i16> %vgpr, <i16 8, i16 8>
+ store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+ %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+ %result = shl <4 x i16> %a, %b
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
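; Note (illustrative aside, not a patch line): a <4 x i16> vector occupies a
; 64-bit register pair, so gfx9 legalizes it as two <2 x i16> halves. The
; v_shl_v4i16 checks above expect one v_pk_lshlrev_b16 per 32-bit half of the
; dwordx2 load, and the same doubling appears in shl_v_imm_v4i16 below.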
+; GCN-LABEL: {{^}}shl_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+ %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+ %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+ %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+ store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
new file mode 100644
index 00000000000..01d19a0c24b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -0,0 +1,216 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}s_abs_v2i16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+
+; CIVI: v_sub_i32_e32
+; CIVI-DAG: v_sub_i32_e32
+; CIVI: v_bfe_i32
+; CIVI-DAG: v_bfe_i32
+; CIVI-DAG: v_add_i32_e32
+; CIVI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
+; CIVI: v_add_i32_e32
+; CIVI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; CIVI: v_or_b32_e32
+define void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+ %neg = sub <2 x i16> zeroinitializer, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, <i16 2, i16 2>
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
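; Note (illustrative aside, not a patch line): integer abs is matched here as
; max(x, 0 - x); on gfx9 that is the packed pair checked above (v_pk_sub_i16
; from 0, then v_pk_max_i16). The trailing add of <i16 2, i16 2> gives the
; abs a user so it cannot fold away. Per-lane example:
;   x = -5:  max(-5, 5) + 2 = 7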
+; GCN-LABEL: {{^}}v_abs_v2i16:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32
+define void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4
+ %neg = sub <2 x i16> zeroinitializer, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, <i16 2, i16 2>
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %gep.out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_abs_v2i16_2:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+define void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+ %z0 = insertelement <2 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
+ %t0 = insertelement <2 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
+ %neg = sub <2 x i16> %z1, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, %t1
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v2i16_2:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+define void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+ %z0 = insertelement <2 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
+ %t0 = insertelement <2 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
+ %val = load <2 x i16>, <2 x i16> addrspace(1)* %src, align 4
+ %neg = sub <2 x i16> %z1, %val
+ %cond = icmp sgt <2 x i16> %val, %neg
+ %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+ %res2 = add <2 x i16> %res, %t1
+ store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_abs_v4i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, [[VAL0]]
+; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], [[VAL0]], [[SUB0]]
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]]
+; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+define void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
+ %z0 = insertelement <4 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
+ %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
+ %z3 = insertelement <4 x i16> %z2, i16 0, i16 3
+ %t0 = insertelement <4 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
+ %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
+ %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
+ %neg = sub <4 x i16> %z3, %val
+ %cond = icmp sgt <4 x i16> %val, %neg
+ %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
+ %res2 = add <4 x i16> %res, %t3
+ store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v4i16:
+; GFX9: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
+; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
+; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+define void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
+ %z0 = insertelement <4 x i16> undef, i16 0, i16 0
+ %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
+ %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
+ %z3 = insertelement <4 x i16> %z2, i16 0, i16 3
+ %t0 = insertelement <4 x i16> undef, i16 2, i16 0
+ %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
+ %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
+ %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
+ %val = load <4 x i16>, <4 x i16> addrspace(1)* %src, align 4
+ %neg = sub <4 x i16> %z3, %val
+ %cond = icmp sgt <4 x i16> %val, %neg
+ %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
+ %res2 = add <4 x i16> %res, %t3
+ store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+ ret void
+}
+
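; Note (illustrative aside, not a patch line): the min/max tests below express
; a max/min pair as one signed compare feeding two selects
; (sel0 = sgt ? a : b, sel1 = sgt ? b : a). Only the unsigned variant at the
; end of the file checks instructions (v_pk_max_u16 / v_pk_min_u16); the
; signed versions are label-only for now.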
+; GCN-LABEL: {{^}}s_min_max_v2i16:
+define void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
+ %cond0 = icmp sgt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_min_max_v2i16:
+define void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+ %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
+ %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_min_max_v4i32:
+define void @s_min_max_v4i32(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
+ %cond0 = icmp sgt <4 x i16> %val0, %val1
+ %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1
+ %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0
+
+ store volatile <4 x i16> %sel0, <4 x i16> addrspace(1)* %out0, align 4
+ store volatile <4 x i16> %sel1, <4 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_min_max_v2i16_user:
+define void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+ %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
+ %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+
+ %cond0 = icmp sgt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ store volatile <2 x i1> %cond0, <2 x i1> addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}u_min_max_v2i16:
+; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
+ %cond0 = icmp ugt <2 x i16> %val0, %val1
+ %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+ %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+ store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+ store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
new file mode 100644
index 00000000000..96686cf01ab
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -0,0 +1,278 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
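; Note (illustrative aside, not a patch line): this file mirrors add.v2i16.ll
; with v_pk_sub_i16 on gfx9. Unlike shl.v2i16.ll and sminmax.v2i16.ll above,
; there is no bonaire (CI) RUN line, so only the GFX9 and VI prefixes appear
; in the checks.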
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+
+; VI: s_sub_i32
+; VI: s_sub_i32
+define void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+ %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+ %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+ %add = sub <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
+; GCN: buffer_store_dword [[ZERO]]
+define void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+ %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+ %add = sub <2 x i16> %a, %a
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: VI should not scalarize arg access.
+; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+
+; VI: v_subrev_i32_e32
+; VI: v_subrev_i32_e32
+define void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+ %add = sub <2 x i16> %a, %b
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
+define void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 123, i16 456>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
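; Note (illustrative aside, not a patch line): 0x1c8007b above is
; <i16 123, i16 456> packed low/high into one dword:
; (456 << 16) | 123 = (0x1c8 << 16) | 0x7b. VI has no packed sub and folds
; the subtraction into adds of the negated halves, hence the checks for
;   v_add_u16_e32 ..., 0xfffffe38   ; -456 (0xfe38 as i16)
;   v_add_u16_e32 ..., 0xffffff85   ; -123 (0xff85 as i16)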
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
+define void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 -845, i16 -991>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
+
+; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
+; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 -1, i16 -1>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 32, i16 0>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; The high element gives fp
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %add = sub <2 x i16> %a, <i16 0, i16 16256>
+ store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
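; Note (illustrative aside, not a patch line): in
; v_test_sub_v2i16_inline_fp_split above, the packed constant
; <i16 0, i16 16256> is the dword 0x3f800000 (16256 = 0x3f80 in the high
; half), which is the bit pattern of 1.0f; the assembler therefore prints it
; as the inline constant in "s_mov_b32 [[K]], 1.0".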
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+
+; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
define void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = zext <2 x i16> %add to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GFX9: buffer_store_dwordx4
+
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI: v_subrev_u16_e32
+; VI: v_subrev_u16_e32
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+
+; VI: buffer_store_dwordx4
+define void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = zext <2 x i16> %add to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
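; Note (illustrative aside, not a patch line): on gfx9 the sub stays packed
; and the extension unpacks the 32-bit result afterwards. zext extracts the
; low lane with v_and_b32 0xffff and the high lane with v_lshrrev_b32 16,
; while the sext tests below extract with v_bfe_i32 ..., 0, 16 and
; v_ashrrev_i32 16 instead.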
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
+; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: v_subrev_u16_e32
+; VI: v_subrev_u16_e32
+; VI: buffer_store_dwordx2
+define void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = sext <2 x i16> %add to <2 x i32>
+ store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+ ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+
+; GFX9: v_pk_sub_i16
+; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+
+; VI: v_subrev_u16_e32
+; VI: v_subrev_u16_e32
+
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+define void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+ %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+ %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+ %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+ %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+ %add = sub <2 x i16> %a, %b
+ %ext = sext <2 x i16> %add to <2 x i64>
+ store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }