diff options
Diffstat (limited to 'llvm')
7 files changed, 47 insertions, 39 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b57b2d2fd20..34643c99e11 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4369,12 +4369,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,    MVT IntVT = MVT::getIntegerVT(VecSize);    // Avoid stack access for dynamic indexing. -  SDValue Val = InsVal; -  if (InsVal.getValueType() == MVT::f16) -      Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); -    // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec -  SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); + +  // Create a congruent vector with the target value in each element so that +  // the required element can be masked and ORed into the target vector. +  SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, +                               DAG.getSplatBuildVector(VecVT, SL, InsVal));    assert(isPowerOf2_32(EltSize));    SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll index e2741c25382..cee091af643 100644 --- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -814,8 +814,8 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {  }  ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16: -; GFX9: v_pk_mul_f16  ; GFX9: v_mul_f16_e32 +; GFX9: v_pk_mul_f16  ; GFX9-NOT: v_max  ; GFX9-NOT: v_pk_max  define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) { diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 80309b40e17..b311b6aa29d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -112,7 +112,10 @@ entry:  ; 
GCN-NOT: buffer_  ; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4  ; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x3c00 +; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x3c003c00 +; GCN:     v_mov_b32_e32 [[V:v[0-9]+]], [[K]] +; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}} +; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}  define amdgpu_kernel void @half4_inselt(<4 x half> addrspace(1)* %out, <4 x half> %vec, i32 %sel) {  entry:    %v = insertelement <4 x half> %vec, half 1.000000e+00, i32 %sel @@ -168,9 +171,10 @@ entry:  ; GCN-NOT: v_cndmask_b32  ; GCN-NOT: v_movrel  ; GCN-NOT: buffer_ +; GCN:     v_mov_b32_e32 [[K:v[0-9]+]], 0x10001  ; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4  ; GCN:     s_lshl_b32 [[V:s[0-9]+]], 0xffff, [[SEL]] -; GCN:     v_bfi_b32 v{{[0-9]+}}, [[V]], 1, v{{[0-9]+}} +; GCN:     v_bfi_b32 v{{[0-9]+}}, [[V]], [[K]], v{{[0-9]+}}  define amdgpu_kernel void @short2_inselt(<2 x i16> addrspace(1)* %out, <2 x i16> %vec, i32 %sel) {  entry:    %v = insertelement <2 x i16> %vec, i16 1, i32 %sel @@ -184,7 +188,10 @@ entry:  ; GCN-NOT: buffer_  ; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 4  ; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x10001 +; GCN:     v_mov_b32_e32 [[V:v[0-9]+]], [[K]] +; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}} +; GCN:     v_bfi_b32 v{{[0-9]+}}, s{{[0-9]+}}, [[V]], v{{[0-9]+}}  define amdgpu_kernel void @short4_inselt(<4 x i16> addrspace(1)* %out, <4 x i16> %vec, i32 %sel) {  entry:    %v = insertelement <4 x i16> %vec, i16 1, i32 %sel @@ -197,7 +204,11 @@ entry:  ; GCN-NOT: buffer_  ; GCN:     s_lshl_b32 [[SEL:s[0-9]+]], s{{[0-9]+}}, 3  ; GCN:     s_lshl_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SEL]] -; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 +; GCN:     s_mov_b32 [[K:s[0-9]+]], 0x1010101 +; 
GCN:     s_and_b32 s3, s1, [[K]] +; GCN:     s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]] +; GCN:     s_andn2_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}] +; GCN:     s_or_b64 s[{{[0-9:]+}}], s[{{[0-9:]+}}], s[{{[0-9:]+}}]  define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %vec, i32 %sel) {  entry:    %v = insertelement <8 x i8> %vec, i8 1, i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 93ee16ea85d..47e080a94ba 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -242,7 +242,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %  ; VI-NOT: _load  ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3  ; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1 -; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]] +; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 0x505, [[MASK]]  ; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]  ; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]  ; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]] @@ -261,15 +261,14 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou  ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c  ; VI-NOT: _load +; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505  ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]  ; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3  ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI: s_andn2_b32 [[AND_NOT_MASK:s[0-9]+]], [[LOAD]],  [[SHIFTED_MASK]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] -; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16 +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]] +; VI: v_lshrrev_b32_e32 [[V_HI2:v[0-9]+]], 16, [[BFI]] -; VI-DAG: buffer_store_short [[BFI]] -; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]] +; VI: 
buffer_store_short [[BFI]]  ; VI: buffer_store_byte [[V_HI2]]  define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {    %vecins = insertelement <3 x i8> %a, i8 5, i32 %b @@ -282,10 +281,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou  ; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c  ; VI-NOT: _load +; VI: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x5050505  ; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]  ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3  ; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]] -; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]] +; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], [[VAL]], [[V_LOAD]]  ; VI: buffer_store_dword [[BFI]]  define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {    %vecins = insertelement <4 x i8> %a, i8 5, i32 %b @@ -303,9 +303,11 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou  ; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3  ; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff  ; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] +; VI: s_mov_b32 [[VAL:s[0-9]+]], 0x5050505 +; VI: s_and_b32 s[[INS_HI:[0-9]+]], s[[MASK_SHIFT_HI]], [[VAL]] +; VI: s_and_b32 s[[INS_LO:[0-9]+]], s[[MASK_SHIFT_LO]], [[VAL]]  ; VI: s_andn2_b64 [[AND:s\[[0-9]+:[0-9]+\]]], [[VEC]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}} -; VI: s_and_b32 s[[INS:[0-9]+]], s[[MASK_SHIFT_LO]], 5 -; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS]]:[[MASK_HI]]{{\]}}, [[AND]] +; VI: s_or_b64 s{{\[}}[[RESULT0:[0-9]+]]:[[RESULT1:[0-9]+]]{{\]}}, s{{\[}}[[INS_LO]]:[[INS_HI]]{{\]}}, [[AND]]  ; VI: v_mov_b32_e32 v[[V_RESULT0:[0-9]+]], s[[RESULT0]]  ; VI: 
v_mov_b32_e32 v[[V_RESULT1:[0-9]+]], s[[RESULT1]]  ; VI: buffer_store_dwordx2 v{{\[}}[[V_RESULT0]]:[[V_RESULT1]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index b4fb59983cb..a8a298045ff 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -446,7 +446,7 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac  ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:  ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x12341234  ; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]]  ; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] @@ -611,25 +611,20 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out,  ; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]  ; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} -; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff  ; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0 -; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}} - -; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}} -; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9+]]], v[[SHIFT_LO]] -; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9+]]], v[[SHIFT_HI]] -; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]] - -; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]] -; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]] -; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD - - -; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]] -; 
CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]], -; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0, +; CIVI-DAG: s_and_b32 [[MASKED_VAL:s[0-9]+]], [[VAL]], s[[MASK_LO]] +; VI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[MASKED_VAL]], 16 +; CI-DAG: s_lshl_b32 [[SHIFTED_VAL:s[0-9]+]], [[VAL]], 16 +; CIVI: s_or_b32 [[DUP_VAL:s[0-9]+]], [[MASKED_VAL]], [[SHIFTED_VAL]] +; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX9-DAG: s_pack_ll_b32_b16 [[DUP_VAL:s[0-9]+]], [[VAL]], [[VAL]] +; GFX89: v_lshlrev_b64 v[{{[0-9:]+}}], [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}} +; CI: v_lshl_b64 v[{{[0-9:]+}}], s[{{[0-9:]+}}], [[SCALED_IDX]] +; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}} +; GCN: v_bfi_b32 v{{[0-9]+}}, v{{[0-9]+}}, [[DUP_VAL]], v{{[0-9]+}} -; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}} +; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}  define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {    %tid = call i32 @llvm.amdgcn.workitem.id.x() #1    %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll index e35cf7a349a..34e1d201c9c 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -3,7 +3,7 @@  ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:  ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7  ; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]  ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll 
b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll index 07ee65526c9..95e38c36e62 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -6,7 +6,7 @@  ; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]  ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 +; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0x3e703e7  ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]  ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]  | 

