summaryrefslogtreecommitdiffstats
path: root/llvm/test/CodeGen/AMDGPU/idot4u.ll
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU/idot4u.ll')
-rw-r--r--llvm/test/CodeGen/AMDGPU/idot4u.ll215
1 files changed, 103 insertions, 112 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 3dea940ec3f..635c6b56f9f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -355,20 +355,20 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s2, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
@@ -387,20 +387,20 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
@@ -485,23 +485,23 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_movk_i32 s2, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_and_b32 s2, s0, s2
+; GFX8-NEXT: s_and_b32 s3, s2, s0
+; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -509,23 +509,23 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@@ -533,23 +533,23 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-DL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@@ -619,19 +619,19 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -651,19 +651,19 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -765,19 +765,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT: s_and_b32 s3, s2, s0
-; GFX8-NEXT: s_and_b32 s0, s1, s0
+; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_and_b32 s0, s2, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -797,19 +797,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -829,19 +829,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-DL-NEXT: s_and_b32 s3, s1, s0
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-DL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -1268,33 +1268,30 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX8-NEXT: s_and_b32 s3, s2, s3
-; GFX8-NEXT: s_and_b32 s2, s2, s4
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: s_lshr_b32 s0, s0, 24
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_sext_i32_i8 s4, s3
+; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX8-NEXT: s_lshr_b32 s3, s3, 24
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_lshr_b32 s2, s2, 24
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1302,33 +1299,30 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX9-NODL-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3
-; GFX9-NODL-NEXT: s_and_b32 s2, s2, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NODL-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@@ -1336,33 +1330,30 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s3
-; GFX9-DL-NEXT: s_and_b32 s2, s2, s4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3
+; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2
+; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
OpenPOWER on IntegriCloud