summary refs log tree commit diff stats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r-- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp 63
-rw-r--r-- llvm/test/CodeGen/AMDGPU/and.ll 20
-rw-r--r-- llvm/test/CodeGen/AMDGPU/calling-conventions.ll 5
-rw-r--r-- llvm/test/CodeGen/AMDGPU/fneg-combines.ll 2
-rw-r--r-- llvm/test/CodeGen/AMDGPU/idot2.ll 40
-rw-r--r-- llvm/test/CodeGen/AMDGPU/idot4s.ll 133
-rw-r--r-- llvm/test/CodeGen/AMDGPU/idot4u.ll 215
-rw-r--r-- llvm/test/CodeGen/AMDGPU/idot8s.ll 474
-rw-r--r-- llvm/test/CodeGen/AMDGPU/idot8u.ll 393
-rw-r--r-- llvm/test/CodeGen/AMDGPU/mad_uint24.ll 22
-rw-r--r-- llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll 4
-rw-r--r-- llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll 3
-rw-r--r-- llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll 9
-rw-r--r-- llvm/test/CodeGen/ARM/vdup.ll 4
-rw-r--r-- llvm/test/CodeGen/PowerPC/pr39478.ll 4
-rw-r--r-- llvm/test/CodeGen/PowerPC/testComparesigesll.ll 4
-rw-r--r-- llvm/test/CodeGen/X86/constant-combines.ll 36
-rw-r--r-- llvm/test/CodeGen/X86/extractelement-fp.ll 6
-rw-r--r-- llvm/test/CodeGen/X86/jump_sign.ll 4
-rw-r--r-- llvm/test/CodeGen/X86/legalize-shift-64.ll 6
-rw-r--r-- llvm/test/CodeGen/X86/masked_gather_scatter.ll 2
-rw-r--r-- llvm/test/CodeGen/X86/movmsk.ll 7
-rw-r--r-- llvm/test/CodeGen/X86/not-and-simplify.ll 3
-rw-r--r-- llvm/test/CodeGen/X86/oddshuffles.ll 12
-rw-r--r-- llvm/test/CodeGen/X86/pr28504.ll 37
-rw-r--r-- llvm/test/CodeGen/X86/pr33844.ll 38
-rw-r--r-- llvm/test/CodeGen/X86/sse3.ll 8
-rw-r--r-- llvm/test/CodeGen/X86/vec_extract-mmx.ll 14
-rw-r--r-- llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll 212
-rw-r--r-- llvm/test/CodeGen/X86/widen_arith-3.ll 3
32 files changed, 743 insertions, 1048 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index ec8949071e8..2cd698eb7e0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -137,6 +137,10 @@ namespace {
/// them) when they are deleted from the underlying DAG. It relies on
/// stable indices of nodes within the worklist.
DenseMap<SDNode *, unsigned> WorklistMap;
+ /// This records all nodes we attempted to add to the worklist since we
+ /// considered a new worklist entry. As we do not add duplicate nodes
+ /// to the worklist, this is different from the tail of the worklist.
+ SmallSetVector<SDNode *, 32> PruningList;
/// Set of nodes which have been combined (at least once).
///
@@ -154,6 +158,37 @@ namespace {
AddToWorklist(Node);
}
+ // Prune potentially dangling nodes. This is called after
+ // any visit to a node, but should also be called during a visit after any
+ // failed combine which may have created a DAG node.
+ void clearAddedDanglingWorklistEntries() {
+ // Check any nodes added to the worklist to see if they are prunable.
+ while (!PruningList.empty()) {
+ auto *N = PruningList.pop_back_val();
+ if (N->use_empty())
+ recursivelyDeleteUnusedNodes(N);
+ }
+ }
+
+ SDNode *getNextWorklistEntry() {
+ // Before we do any work, remove nodes that are not in use.
+ clearAddedDanglingWorklistEntries();
+ SDNode *N = nullptr;
+ // The Worklist holds the SDNodes in order, but it may contain null
+ // entries.
+ while (!N && !Worklist.empty()) {
+ N = Worklist.pop_back_val();
+ }
+
+ if (N) {
+ bool GoodWorklistEntry = WorklistMap.erase(N);
+ (void)GoodWorklistEntry;
+ assert(GoodWorklistEntry &&
+ "Found a worklist entry without a corresponding map entry!");
+ }
+ return N;
+ }
+
/// Call the node-specific routine that folds each particular type of node.
SDValue visit(SDNode *N);
@@ -171,6 +206,11 @@ namespace {
MaximumLegalStoreInBits = VT.getSizeInBits();
}
+ void ConsiderForPruning(SDNode *N) {
+ // Mark this for potential pruning.
+ PruningList.insert(N);
+ }
+
/// Add to the worklist making sure its instance is at the back (next to be
/// processed.)
void AddToWorklist(SDNode *N) {
@@ -182,6 +222,8 @@ namespace {
if (N->getOpcode() == ISD::HANDLENODE)
return;
+ ConsiderForPruning(N);
+
if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
Worklist.push_back(N);
}
@@ -189,6 +231,7 @@ namespace {
/// Remove all instances of N from the worklist.
void removeFromWorklist(SDNode *N) {
CombinedNodes.erase(N);
+ PruningList.remove(N);
auto It = WorklistMap.find(N);
if (It == WorklistMap.end())
@@ -654,8 +697,9 @@ public:
explicit WorklistInserter(DAGCombiner &dc)
: SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
- // This should eventually be pruning.
- void NodeInserted(SDNode *N) override { }
+ // FIXME: Ideally we could add N to the worklist, but this causes exponential
+ // compile time costs in large DAGs, e.g. Halide.
+ void NodeInserted(SDNode *N) override { DC.ConsiderForPruning(N); }
};
} // end anonymous namespace
@@ -1421,19 +1465,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
// changes of the root.
HandleSDNode Dummy(DAG.getRoot());
- // While the worklist isn't empty, find a node and try to combine it.
- while (!WorklistMap.empty()) {
- SDNode *N;
- // The Worklist holds the SDNodes in order, but it may contain null entries.
- do {
- N = Worklist.pop_back_val();
- } while (!N);
-
- bool GoodWorklistEntry = WorklistMap.erase(N);
- (void)GoodWorklistEntry;
- assert(GoodWorklistEntry &&
- "Found a worklist entry without a corresponding map entry!");
-
+ // While we have a valid worklist entry node, try to combine it.
+ while (SDNode *N = getNextWorklistEntry()) {
// If N has no uses, it is dead. Make sure to revisit all N's operands once
// N is deleted from the DAG, since they too may now be dead or may have a
// reduced number of uses, allowing other xforms.
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index 739e6c1c92c..57f6da5d9ee 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -405,7 +405,7 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 ad
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 1.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
; SI-NOT: and
@@ -420,7 +420,7 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -1.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
; SI-NOT: and
@@ -435,7 +435,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out,
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0.5
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
; SI-NOT: and
@@ -450,7 +450,7 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -0.5
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
; SI-NOT: and
@@ -463,7 +463,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out,
; FUNC-LABEL: {{^}}s_and_inline_imm_2.0_i64:
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
; SI-NOT: and
@@ -476,7 +476,7 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64
; FUNC-LABEL: {{^}}s_and_inline_imm_neg_2.0_i64:
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
; SI-NOT: and
@@ -491,7 +491,7 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out,
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 4.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
; SI-NOT: and
@@ -506,7 +506,7 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, -4.0
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
; SI-NOT: and
@@ -549,7 +549,7 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %o
; Shift into upper 32-bits
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
@@ -562,7 +562,7 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %
; FUNC-LABEL: {{^}}s_and_inline_high_imm_f32_neg_4.0_i64:
; SI: s_load_dwordx2
-; SI: s_load_dwordx2
+; SI: s_load_dword
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 1409f843cda..748222529d7 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -181,11 +181,10 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
}
; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
-; VI: s_lshr_b32 s1, s0, 16
-; VI: s_add_i32 s1, s1, 1
+; VI: s_and_b32 s1, s0, 0xffff0000
; VI: s_add_i32 s0, s0, 1
+; VI: s_add_i32 s1, s1, 0x10000
; VI: s_and_b32 s0, s0, 0xffff
-; VI: s_lshl_b32 s1, s1, 16
; VI: s_or_b32 s0, s0, s1
; VI: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index e57ebc9c061..d705f319437 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1366,7 +1366,7 @@ define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrs
; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
-; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], [[A]], -[[B]], -[[C]]
; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[NEG_MAD]]
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index c923d1c0bc7..7ca2d842fd2 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -2178,23 +2178,23 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
+; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_and_b32 s0, s2, s0
-; GFX8-NEXT: s_lshr_b32 s2, s2, 16
+; GFX8-NEXT: s_and_b32 s3, s1, s2
; GFX8-NEXT: s_lshr_b32 s1, s1, 16
-; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: s_and_b32 s2, s0, s2
+; GFX8-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v3, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -2202,23 +2202,23 @@ define amdgpu_kernel void @udot2_acc16(<2 x i16> addrspace(1)* %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
+; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
+; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 16
+; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v3, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 2c5873ea17a..3d84292f696 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -201,38 +201,29 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX8-NEXT: s_mov_b32 s0, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s5, s1, 8
-; GFX8-NEXT: s_lshr_b32 s6, s2, 8
-; GFX8-NEXT: s_sext_i32_i8 s4, s2
-; GFX8-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX8-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX8-NEXT: s_lshr_b32 s2, s2, 24
-; GFX8-NEXT: v_mov_b32_e32 v3, s4
-; GFX8-NEXT: s_sext_i32_i8 s3, s1
-; GFX8-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: s_and_b32 s4, s0, s5
-; GFX8-NEXT: s_and_b32 s5, s0, s6
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX8-NEXT: s_sext_i32_i8 s0, s2
+; GFX8-NEXT: s_sext_i32_i8 s1, s3
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_bfe_i32 s4, s3, 0x80008
+; GFX8-NEXT: s_bfe_i32 s5, s3, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_bfe_i32 s1, s2, 0x80008
+; GFX8-NEXT: s_bfe_i32 s4, s2, 0x80010
+; GFX8-NEXT: s_ashr_i32 s3, s3, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: s_and_b32 s1, s0, s1
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: s_and_b32 s0, s0, s2
+; GFX8-NEXT: s_ashr_i32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s3, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s7, v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s0
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -241,38 +232,29 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-NODL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_lshr_b32 s5, s1, 8
-; GFX9-NODL-NEXT: s_lshr_b32 s6, s2, 8
-; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s2
-; GFX9-NODL-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX9-NODL-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX9-NODL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-NODL-NEXT: s_sext_i32_i8 s3, s1
-; GFX9-NODL-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: s_and_b32 s4, s0, s5
-; GFX9-NODL-NEXT: s_and_b32 s5, s0, s6
-; GFX9-NODL-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX9-NODL-NEXT: s_bfe_i32 s2, s2, 0x80000
+; GFX9-NODL-NEXT: s_sext_i32_i8 s0, s2
+; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s3
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT: s_bfe_i32 s4, s3, 0x80008
+; GFX9-NODL-NEXT: s_bfe_i32 s5, s3, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT: s_bfe_i32 s1, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_i32 s4, s2, 0x80010
+; GFX9-NODL-NEXT: s_ashr_i32 s3, s3, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NODL-NEXT: s_and_b32 s1, s0, s1
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NODL-NEXT: s_and_b32 s0, s0, s2
+; GFX9-NODL-NEXT: s_ashr_i32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
-; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s4, v5, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@@ -281,38 +263,15 @@ define amdgpu_kernel void @idot4_acc16(<4 x i8> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
-; GFX9-DL-NEXT: s_mov_b32 s0, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 8
-; GFX9-DL-NEXT: s_lshr_b32 s6, s2, 8
-; GFX9-DL-NEXT: s_sext_i32_i8 s4, s2
-; GFX9-DL-NEXT: s_bfe_i32 s5, s5, 0x80000
-; GFX9-DL-NEXT: s_bfe_i32 s6, s6, 0x80000
-; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
-; GFX9-DL-NEXT: s_sext_i32_i8 s3, s1
-; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: s_and_b32 s4, s0, s5
-; GFX9-DL-NEXT: s_and_b32 s5, s0, s6
-; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x80000
-; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x80000
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-DL-NEXT: s_and_b32 s1, s0, s1
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-DL-NEXT: s_and_b32 s0, s0, s2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s3, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v4, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
+; GFX9-DL-NEXT: v_dot4_i32_i8 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@@ -399,20 +358,20 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s2, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
@@ -431,20 +390,20 @@ define amdgpu_kernel void @idot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 3dea940ec3f..635c6b56f9f 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -355,20 +355,20 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s2, s0, s2
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s0, s0, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
@@ -387,20 +387,20 @@ define amdgpu_kernel void @udot4_acc8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s0, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
@@ -485,23 +485,23 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_movk_i32 s2, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ubyte v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_and_b32 s3, s1, s2
-; GFX8-NEXT: s_and_b32 s2, s0, s2
+; GFX8-NEXT: s_and_b32 s3, s2, s0
+; GFX8-NEXT: s_and_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX8-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX8-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -509,23 +509,23 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: s_movk_i32 s2, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-NODL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX9-NODL-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX9-NODL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-NODL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@@ -533,23 +533,23 @@ define amdgpu_kernel void @udot2_8(<4 x i8> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_movk_i32 s2, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
+; GFX9-DL-NEXT: s_load_dword s1, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0
+; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_and_b32 s3, s1, s2
-; GFX9-DL-NEXT: s_and_b32 s2, s0, s2
+; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
+; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x80008
; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x80008
-; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x80008
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v3, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
@@ -619,19 +619,19 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s3, s1, s0
-; GFX8-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX8-NEXT: s_and_b32 s0, s2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s3
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v4, s5
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -651,19 +651,19 @@ define amdgpu_kernel void @udot4_CommutationInsideMAD(<4 x i8> addrspace(1)* %sr
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s1, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s5
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -765,19 +765,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX8-NEXT: s_movk_i32 s0, 0xff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX8-NEXT: s_and_b32 s3, s2, s0
-; GFX8-NEXT: s_and_b32 s0, s1, s0
+; GFX8-NEXT: s_and_b32 s3, s1, s0
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v4, s0
+; GFX8-NEXT: s_and_b32 s0, s2, s0
+; GFX8-NEXT: v_mov_b32_e32 v4, s3
; GFX8-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX8-NEXT: s_lshr_b32 s1, s1, 24
; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_lshr_b32 s2, s2, 24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -797,19 +797,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-NODL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-NODL-NEXT: s_and_b32 s3, s1, s0
; GFX9-NODL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-NODL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NODL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -829,19 +829,19 @@ define amdgpu_kernel void @udot4_CommutationAccrossMADs(<4 x i8> addrspace(1)* %
; GFX9-DL-NEXT: s_movk_i32 s0, 0xff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: s_bfe_u32 s4, s1, 0x80008
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s0
-; GFX9-DL-NEXT: s_and_b32 s0, s1, s0
+; GFX9-DL-NEXT: s_and_b32 s3, s1, s0
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s0
+; GFX9-DL-NEXT: s_and_b32 s0, s2, s0
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s3
; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x80010
; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v4, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
@@ -1268,33 +1268,30 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX8-NEXT: s_mov_b32 s2, 0xffff
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
-; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX8-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX8-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX8-NEXT: s_and_b32 s3, s2, s3
-; GFX8-NEXT: s_and_b32 s2, s2, s4
-; GFX8-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX8-NEXT: v_mov_b32_e32 v5, s2
-; GFX8-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX8-NEXT: s_lshr_b32 s1, s1, 24
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: s_lshr_b32 s0, s0, 24
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX8-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX8-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_sext_i32_i8 s4, s3
+; GFX8-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX8-NEXT: v_mov_b32_e32 v4, s4
+; GFX8-NEXT: s_sext_i32_i8 s1, s2
+; GFX8-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX8-NEXT: s_lshr_b32 s3, s3, 24
+; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_lshr_b32 s2, s2, 24
+; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, s3
+; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -1302,33 +1299,30 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-NODL: ; %bb.0: ; %entry
; GFX9-NODL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NODL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-NODL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NODL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-NODL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NODL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NODL-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX9-NODL-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX9-NODL-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX9-NODL-NEXT: s_and_b32 s3, s2, s3
-; GFX9-NODL-NEXT: s_and_b32 s2, s2, s4
-; GFX9-NODL-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-NODL-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-NODL-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX9-NODL-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX9-NODL-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NODL-NEXT: s_sext_i32_i8 s4, s3
+; GFX9-NODL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX9-NODL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-NODL-NEXT: s_sext_i32_i8 s1, s2
+; GFX9-NODL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX9-NODL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-NODL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NODL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-NODL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX9-NODL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NODL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NODL-NEXT: global_store_short v[0:1], v2, off
; GFX9-NODL-NEXT: s_endpgm
;
@@ -1336,33 +1330,30 @@ define amdgpu_kernel void @notdot4_mixedtypes(<4 x i8> addrspace(1)* %src1,
; GFX9-DL: ; %bb.0: ; %entry
; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
-; GFX9-DL-NEXT: s_mov_b32 s2, 0xffff
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0
+; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
-; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_bfe_i32 s3, s0, 0x80000
-; GFX9-DL-NEXT: s_bfe_u32 s6, s1, 0x80008
-; GFX9-DL-NEXT: s_bfe_i32 s4, s1, 0x80000
-; GFX9-DL-NEXT: s_and_b32 s3, s2, s3
-; GFX9-DL-NEXT: s_and_b32 s2, s2, s4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x80008
-; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6
-; GFX9-DL-NEXT: s_bfe_u32 s8, s1, 0x80010
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s2
-; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x80010
-; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 24
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s8
-; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 24
-; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v4, v2
+; GFX9-DL-NEXT: s_bfe_u32 s0, s2, 0x80008
+; GFX9-DL-NEXT: s_bfe_u32 s1, s3, 0x80008
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-DL-NEXT: s_sext_i32_i8 s4, s3
+; GFX9-DL-NEXT: s_bfe_u32 s5, s3, 0x80010
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-DL-NEXT: s_sext_i32_i8 s1, s2
+; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x80010
+; GFX9-DL-NEXT: s_lshr_b32 s3, s3, 24
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 24
+; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v5, v2
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<4 x i8> addrspace(1)* %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index a805bb7098e..6235c1d641b 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -308,52 +308,43 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_load_ushort v2, v[0:1]
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s0, s2, 4
-; GFX8-NEXT: s_lshr_b32 s1, s4, 4
-; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40000
-; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX8-NEXT: s_bfe_i32 s0, s4, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
-; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX8-NEXT: s_bfe_i32 s0, s2, 0x40000
+; GFX8-NEXT: s_bfe_i32 s1, s4, 0x40000
+; GFX8-NEXT: v_mov_b32_e32 v3, s1
+; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40004
+; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40008
; GFX8-NEXT: s_lshr_b32 s1, s2, 12
-; GFX8-NEXT: s_lshr_b32 s5, s4, 12
-; GFX8-NEXT: v_mov_b32_e32 v6, s0
-; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40008
-; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s1
-; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX8-NEXT: v_mul_i32_i24_e32 v6, s7, v6
-; GFX8-NEXT: s_lshr_b32 s0, s2, 20
-; GFX8-NEXT: s_lshr_b32 s1, s4, 20
-; GFX8-NEXT: s_bfe_i32 s5, s4, 0x40010
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s0
-; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s1
-; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v13, s5
-; GFX8-NEXT: s_lshr_b32 s0, s2, 28
-; GFX8-NEXT: s_lshr_b32 s9, s4, 28
-; GFX8-NEXT: s_bfe_i32 s4, s4, 0x40018
-; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s0
-; GFX8-NEXT: v_lshlrev_b16_e64 v12, 12, s9
-; GFX8-NEXT: s_bfe_i32 s2, s2, 0x40018
-; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX8-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX8-NEXT: s_lshr_b32 s7, s4, 12
+; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008
+; GFX8-NEXT: v_mov_b32_e32 v4, s6
+; GFX8-NEXT: v_mov_b32_e32 v7, s5
+; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s1
+; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX8-NEXT: v_mul_i32_i24_e32 v4, s9, v4
+; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40010
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX8-NEXT: s_bfe_i32 s12, s4, 0x40014
+; GFX8-NEXT: s_bfe_i32 s11, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v8, s10
+; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40018
+; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v9, s12
+; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40018
+; GFX8-NEXT: s_ashr_i32 s4, s4, 28
+; GFX8-NEXT: v_mov_b32_e32 v10, s14
+; GFX8-NEXT: s_ashr_i32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v6, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX8-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s8, v13, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v9, v10, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX8-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v11, v12, v2
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -368,52 +359,43 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s0, s2, 4
-; GFX9-NEXT: s_lshr_b32 s1, s4, 4
-; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40000
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-NEXT: s_bfe_i32 s0, s4, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-NEXT: s_bfe_i32 s0, s2, 0x40000
+; GFX9-NEXT: s_bfe_i32 s1, s4, 0x40000
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40004
+; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40008
; GFX9-NEXT: s_lshr_b32 s1, s2, 12
-; GFX9-NEXT: s_lshr_b32 s5, s4, 12
-; GFX9-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40008
-; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s1
-; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-NEXT: v_mul_i32_i24_e32 v6, s7, v6
-; GFX9-NEXT: s_lshr_b32 s0, s2, 20
-; GFX9-NEXT: s_lshr_b32 s1, s4, 20
-; GFX9-NEXT: s_bfe_i32 s5, s4, 0x40010
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s1
-; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v13, s5
-; GFX9-NEXT: s_lshr_b32 s0, s2, 28
-; GFX9-NEXT: s_lshr_b32 s9, s4, 28
-; GFX9-NEXT: s_bfe_i32 s4, s4, 0x40018
-; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s0
-; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s9
-; GFX9-NEXT: s_bfe_i32 s2, s2, 0x40018
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX9-NEXT: s_lshr_b32 s7, s4, 12
+; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-NEXT: v_mov_b32_e32 v7, s5
+; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s1
+; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX9-NEXT: v_mul_i32_i24_e32 v4, s9, v4
+; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40010
+; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-NEXT: s_bfe_i32 s12, s4, 0x40014
+; GFX9-NEXT: s_bfe_i32 s11, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v8, s10
+; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40018
+; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40018
+; GFX9-NEXT: s_ashr_i32 s4, s4, 28
+; GFX9-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s8, v13, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v9, v10, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v11, v12, v2
; GFX9-NEXT: global_store_short v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
@@ -428,52 +410,43 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1
; GFX9-DL-NEXT: global_load_ushort v2, v[0:1], off
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 4
-; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 4
-; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40000
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1
-; GFX9-DL-NEXT: s_bfe_i32 s0, s4, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
-; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s0, s2, 0x40000
+; GFX9-DL-NEXT: s_bfe_i32 s1, s4, 0x40000
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40008
; GFX9-DL-NEXT: s_lshr_b32 s1, s2, 12
-; GFX9-DL-NEXT: s_lshr_b32 s5, s4, 12
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s0
-; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40008
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s1
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v6, s7, v6
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 20
-; GFX9-DL-NEXT: s_lshr_b32 s1, s4, 20
-; GFX9-DL-NEXT: s_bfe_i32 s5, s4, 0x40010
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s1
-; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v13, s5
-; GFX9-DL-NEXT: s_lshr_b32 s0, s2, 28
-; GFX9-DL-NEXT: s_lshr_b32 s9, s4, 28
-; GFX9-DL-NEXT: s_bfe_i32 s4, s4, 0x40018
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s0
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s9
-; GFX9-DL-NEXT: s_bfe_i32 s2, s2, 0x40018
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v12, 12, v12
+; GFX9-DL-NEXT: s_lshr_b32 s7, s4, 12
+; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s5
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s1
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s7
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, s9, v4
+; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40010
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
+; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s10
+; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s12
+; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v10, s14
+; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v7, v8, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v13, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v9, v10, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, v5, v6, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s2, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v11, v12, v2
; GFX9-DL-NEXT: global_store_short v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
@@ -622,60 +595,45 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_lshr_b32 s7, s0, 4
-; GFX8-NEXT: s_lshr_b32 s11, s1, 4
-; GFX8-NEXT: v_lshlrev_b16_e64 v3, 12, s7
-; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s11
-; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40000
-; GFX8-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX8-NEXT: s_lshr_b32 s4, s0, 12
+; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX8-NEXT: s_lshr_b32 s5, s1, 12
+; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40004
+; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX8-NEXT: v_mov_b32_e32 v6, s7
+; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s5
+; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008
+; GFX8-NEXT: v_mov_b32_e32 v3, s11
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX8-NEXT: s_lshr_b32 s6, s0, 12
-; GFX8-NEXT: s_lshr_b32 s10, s1, 12
-; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40008
-; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40000
-; GFX8-NEXT: v_mov_b32_e32 v12, s13
-; GFX8-NEXT: v_lshlrev_b16_e64 v6, 12, s6
-; GFX8-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s15
-; GFX8-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3
+; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40010
; GFX8-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX8-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX8-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX8-NEXT: s_lshr_b32 s5, s0, 20
-; GFX8-NEXT: s_lshr_b32 s9, s1, 20
-; GFX8-NEXT: v_mul_i32_i24_e32 v5, s14, v5
-; GFX8-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX8-NEXT: v_lshlrev_b16_e64 v9, 12, s9
-; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40010
-; GFX8-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX8-NEXT: v_and_b32_e32 v7, s2, v7
-; GFX8-NEXT: s_lshr_b32 s8, s1, 28
-; GFX8-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX8-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX8-NEXT: s_lshr_b32 s4, s0, 28
-; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v13, s17
-; GFX8-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX8-NEXT: v_lshlrev_b16_e64 v11, 12, s8
-; GFX8-NEXT: s_bfe_i32 s1, s1, 0x40018
-; GFX8-NEXT: v_and_b32_e32 v8, s2, v8
-; GFX8-NEXT: v_and_b32_e32 v9, s2, v9
-; GFX8-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX8-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX8-NEXT: s_bfe_i32 s0, s0, 0x40018
-; GFX8-NEXT: v_and_b32_e32 v10, s2, v10
-; GFX8-NEXT: v_and_b32_e32 v11, s2, v11
+; GFX8-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40014
+; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v8, s13
+; GFX8-NEXT: s_bfe_i32 s17, s1, 0x40018
+; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v9, s15
+; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018
+; GFX8-NEXT: s_ashr_i32 s1, s1, 28
+; GFX8-NEXT: v_mov_b32_e32 v10, s17
+; GFX8-NEXT: s_ashr_i32 s0, s0, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mad_i32_i24 v2, s12, v12, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX8-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX8-NEXT: v_mad_i32_i24 v2, s16, v13, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v8, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s6, v6, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2
+; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, v10, v11, v2
; GFX8-NEXT: flat_store_byte v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -691,60 +649,45 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_lshr_b32 s7, s0, 4
-; GFX9-NEXT: s_lshr_b32 s11, s1, 4
-; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s7
-; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s11
-; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40000
-; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-NEXT: s_lshr_b32 s4, s0, 12
+; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX9-NEXT: s_lshr_b32 s5, s1, 12
+; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40004
+; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX9-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s5
+; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008
+; GFX9-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-NEXT: s_lshr_b32 s6, s0, 12
-; GFX9-NEXT: s_lshr_b32 s10, s1, 12
-; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40008
-; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40000
-; GFX9-NEXT: v_mov_b32_e32 v12, s13
-; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s6
-; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s15
-; GFX9-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3
+; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40010
; GFX9-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX9-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-NEXT: s_lshr_b32 s5, s0, 20
-; GFX9-NEXT: s_lshr_b32 s9, s1, 20
-; GFX9-NEXT: v_mul_i32_i24_e32 v5, s14, v5
-; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s9
-; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40010
-; GFX9-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX9-NEXT: v_and_b32_e32 v7, s2, v7
-; GFX9-NEXT: s_lshr_b32 s8, s1, 28
-; GFX9-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-NEXT: s_lshr_b32 s4, s0, 28
-; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v13, s17
-; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s8
-; GFX9-NEXT: s_bfe_i32 s1, s1, 0x40018
-; GFX9-NEXT: v_and_b32_e32 v8, s2, v8
-; GFX9-NEXT: v_and_b32_e32 v9, s2, v9
-; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-NEXT: s_bfe_i32 s0, s0, 0x40018
-; GFX9-NEXT: v_and_b32_e32 v10, s2, v10
-; GFX9-NEXT: v_and_b32_e32 v11, s2, v11
+; GFX9-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40014
+; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v8, s13
+; GFX9-NEXT: s_bfe_i32 s17, s1, 0x40018
+; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v9, s15
+; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018
+; GFX9-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-NEXT: v_mov_b32_e32 v10, s17
+; GFX9-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_mad_i32_i24 v2, s12, v12, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-NEXT: v_mad_i32_i24 v2, s16, v13, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v8, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s6, v6, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2
+; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, v10, v11, v2
; GFX9-NEXT: global_store_byte v[0:1], v2, off
; GFX9-NEXT: s_endpgm
;
@@ -760,60 +703,45 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0
; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4
-; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s7
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s11
-; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40000
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3
+; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 12
+; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40000
+; GFX9-DL-NEXT: s_lshr_b32 s5, s1, 12
+; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40008
+; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40000
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4
+; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s5
+; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004
+; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008
+; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4
-; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 12
-; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 12
-; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40008
-; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40000
-; GFX9-DL-NEXT: v_mov_b32_e32 v12, s13
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s6
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s10
-; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15
-; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3
+; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5
+; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3
+; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40010
; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v6, 12, v6
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v7, 12, v7
-; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 20
-; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 20
-; GFX9-DL-NEXT: v_mul_i32_i24_e32 v5, s14, v5
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s5
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s9
-; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40010
-; GFX9-DL-NEXT: v_and_b32_e32 v6, s2, v6
-; GFX9-DL-NEXT: v_and_b32_e32 v7, s2, v7
-; GFX9-DL-NEXT: s_lshr_b32 s8, s1, 28
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v8, 12, v8
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v9, 12, v9
-; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 28
-; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v13, s17
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4
-; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s8
-; GFX9-DL-NEXT: s_bfe_i32 s1, s1, 0x40018
-; GFX9-DL-NEXT: v_and_b32_e32 v8, s2, v8
-; GFX9-DL-NEXT: v_and_b32_e32 v9, s2, v9
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10
-; GFX9-DL-NEXT: v_ashrrev_i16_e32 v11, 12, v11
-; GFX9-DL-NEXT: s_bfe_i32 s0, s0, 0x40018
-; GFX9-DL-NEXT: v_and_b32_e32 v10, s2, v10
-; GFX9-DL-NEXT: v_and_b32_e32 v11, s2, v11
+; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5
+; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40014
+; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13
+; GFX9-DL-NEXT: s_bfe_i32 s17, s1, 0x40018
+; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15
+; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018
+; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28
+; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17
+; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v12, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2
-; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v6, v7, v2
-; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v13, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v8, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v6, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2
+; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2
+; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, v10, v11, v2
; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off
; GFX9-DL-NEXT: s_endpgm
<8 x i4> addrspace(1)* %src2,
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index b480ac22ea9..3fc4b93d1b0 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -819,35 +819,35 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
+; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX8-NEXT: s_lshr_b32 s4, s4, 28
-; GFX8-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_lshr_b32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
@@ -870,35 +870,35 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_mov_b32_e32 v9, s13
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
@@ -921,35 +921,35 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
@@ -1074,32 +1074,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX8-NEXT: v_mov_b32_e32 v5, s6
; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s7
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v7, s8
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40010
-; GFX8-NEXT: s_bfe_u32 s10, s4, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v8, s9
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40018
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
+; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX8-NEXT: s_lshr_b32 s4, s4, 28
-; GFX8-NEXT: v_mov_b32_e32 v9, s10
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_lshr_b32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v5, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s9, v9, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
@@ -1122,32 +1125,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008
+; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
-; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s10, s4, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-NEXT: v_mov_b32_e32 v9, s13
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v5, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s9, v9, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
@@ -1170,32 +1176,35 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s6
; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
-; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s10, s4, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s9
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v5, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v9, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2
+; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
@@ -2336,35 +2345,35 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX8-NEXT: v_mov_b32_e32 v3, s1
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX8-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v4, s6
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX8-NEXT: v_mov_b32_e32 v5, s5
+; GFX8-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v4, s5
; GFX8-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX8-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX8-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX8-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX8-NEXT: v_mov_b32_e32 v6, s5
-; GFX8-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX8-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX8-NEXT: v_mov_b32_e32 v7, s7
-; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX8-NEXT: v_mov_b32_e32 v8, s8
-; GFX8-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX8-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX8-NEXT: v_mov_b32_e32 v5, s7
+; GFX8-NEXT: v_mov_b32_e32 v6, s6
+; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX8-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX8-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX8-NEXT: v_mov_b32_e32 v7, s9
+; GFX8-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX8-NEXT: v_mov_b32_e32 v8, s11
+; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX8-NEXT: s_lshr_b32 s4, s4, 28
-; GFX8-NEXT: v_mov_b32_e32 v9, s9
+; GFX8-NEXT: v_mov_b32_e32 v9, s13
; GFX8-NEXT: s_lshr_b32 s2, s2, 28
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX8-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX8-NEXT: v_add_u32_e32 v2, vcc, v5, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX8-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX8-NEXT: v_mov_b32_e32 v3, s4
; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX8-NEXT: v_and_b32_e32 v2, 15, v2
@@ -2387,35 +2396,35 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v4, s5
; GFX9-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-NEXT: v_mov_b32_e32 v9, s13
; GFX9-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-NEXT: v_mov_b32_e32 v3, s4
; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-NEXT: v_and_b32_e32 v2, 15, v2
@@ -2438,35 +2447,35 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1
; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x40004
; GFX9-DL-NEXT: s_bfe_u32 s6, s4, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v4, s6
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40008
-; GFX9-DL-NEXT: v_mov_b32_e32 v5, s5
+; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v4, s5
; GFX9-DL-NEXT: s_bfe_u32 s1, s2, 0x40004
-; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, s7, v4
-; GFX9-DL-NEXT: s_bfe_u32 s5, s4, 0x4000c
-; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v4
-; GFX9-DL-NEXT: s_bfe_u32 s7, s4, 0x40010
-; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5
-; GFX9-DL-NEXT: s_bfe_u32 s6, s2, 0x4000c
-; GFX9-DL-NEXT: s_bfe_u32 s8, s4, 0x40014
-; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7
-; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010
-; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40018
-; GFX9-DL-NEXT: v_mov_b32_e32 v8, s8
-; GFX9-DL-NEXT: s_bfe_u32 s7, s2, 0x40014
-; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40008
+; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x4000c
+; GFX9-DL-NEXT: v_mov_b32_e32 v5, s7
+; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, s8, v5
+; GFX9-DL-NEXT: s_bfe_u32 s9, s4, 0x40010
+; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v5
+; GFX9-DL-NEXT: s_bfe_u32 s11, s4, 0x40014
+; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40010
+; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9
+; GFX9-DL-NEXT: s_bfe_u32 s13, s4, 0x40018
+; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40014
+; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11
+; GFX9-DL-NEXT: s_bfe_u32 s14, s2, 0x40018
; GFX9-DL-NEXT: s_lshr_b32 s4, s4, 28
-; GFX9-DL-NEXT: v_mov_b32_e32 v9, s9
+; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13
; GFX9-DL-NEXT: s_lshr_b32 s2, s2, 28
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v5, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s1, v4, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v6, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v4
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v8, v2
-; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v9, v2
+; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v5
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s10, v7, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s12, v8, v2
+; GFX9-DL-NEXT: v_mad_u32_u24 v2, s14, v9, v2
; GFX9-DL-NEXT: v_mov_b32_e32 v3, s4
; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2
; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
index 3c3371bf916..5f109624daf 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,8 +1,8 @@
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN1
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN --check-prefix=GCN2
declare i32 @llvm.r600.read.tidig.x() nounwind readnone
@@ -30,8 +30,12 @@ entry:
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 16
; FIXME: Should be using scalar instructions here.
-; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
+; GCN1: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; GCN1: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
+; GCN2: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}}
+; GCN2: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
+; GCN2: s_sext_i32_i16 s0, [[MAD]]
+; GCN2: v_mov_b32_e32 v0, s0
define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
entry:
%0 = mul i16 %a, %b
@@ -47,8 +51,12 @@ entry:
; The result must be sign-extended
; EG: BFE_INT {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[MAD_CHAN]], 0.0, literal.x
; EG: 8
-; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
-; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
+; GCN1: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
+; GCN1: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
+; GCN2: s_mul_i32 [[MUL:s[0-9]]], {{[s][0-9], [s][0-9]}}
+; GCN2: s_add_i32 [[MAD:s[0-9]]], [[MUL]], s{{[0-9]}}
+; GCN2: s_sext_i32_i8 s0, [[MAD]]
+; GCN2: v_mov_b32_e32 v0, s0
define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
entry:
%0 = mul i8 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index 26e6a3a52ea..b08d54c7911 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -307,10 +307,10 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)*
}
; GCN-LABEL: {{^}}and_not_mask_i64:
-; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
+; GCN-DAG: buffer_load_dword v[[VAL:[0-9]+]]
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
-; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
+; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VAL]]
; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
; GCN-NOT: v[[SHRLO]]
; GCN-NOT: v[[SHRHI]]
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
index f3faa39c64e..2f93efec69b 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -54,9 +54,9 @@ define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)*
; after 64-bit shift is split.
; GCN-LABEL: {{^}}lshr_and_i64_35:
-; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
-; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN: buffer_load_dword v[[LO:[0-9]+]]
+; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[LO]], 8, 23
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%val = load i64, i64 addrspace(1)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
index abc1df0a8ef..26b22ee9bd8 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -384,9 +384,11 @@ define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.a
ret void
}
+; FIXME: This 'or' should fold into an offset on the write.
; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
-; GCN: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:32
+; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
+; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}
; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}} offset:64
diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
index c950e2d7cd3..c5c4476d20f 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll
@@ -27,7 +27,6 @@ define amdgpu_kernel void @widen_i16_constant_load(i16 addrspace(4)* %arg) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -439,7 +438,6 @@ define amdgpu_kernel void @widen_i16_constant32_load(i16 addrspace(6)* %arg) {
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 4
; VI-NEXT: v_mov_b32_e32 v2, s0
@@ -477,7 +475,6 @@ define amdgpu_kernel void @widen_i16_global_invariant_load(i16 addrspace(1)* %ar
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s0, s0, 0xffff
; VI-NEXT: s_addk_i32 s0, 0x3e7
; VI-NEXT: s_or_b32 s0, s0, 1
; VI-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll b/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
index 62242d60787..d44abf7bcd3 100644
--- a/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
+++ b/llvm/test/CodeGen/ARM/CGP/arm-cgp-overflow.ll
@@ -85,10 +85,11 @@ define i32 @overflow_add_positive_const_limit(i8 zeroext %a) {
}
; CHECK-LABEL: unsafe_add_underflow:
-; CHECK: subs r0, #2
-; CHECK: uxtb [[EXT:r[0-9]+]], r0
-; CHECK: cmp [[EXT]], #255
-; CHECK: moveq r0, #8
+; CHECK: movs r1, #16
+; CHECK: cmp r0, #1
+; CHECK: it eq
+; CHECK: moveq r1, #8
+; CHECK: mov r0, r1
define i32 @unsafe_add_underflow(i8 zeroext %a) {
%add = add i8 %a, -2
%cmp = icmp ugt i8 %add, 254
diff --git a/llvm/test/CodeGen/ARM/vdup.ll b/llvm/test/CodeGen/ARM/vdup.ll
index 5127dab2656..74ee4913b5e 100644
--- a/llvm/test/CodeGen/ARM/vdup.ll
+++ b/llvm/test/CodeGen/ARM/vdup.ll
@@ -430,7 +430,6 @@ define <2 x float> @check_f32(<4 x float> %v) nounwind {
; CHECK-LABEL: check_f32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.32 d16, d17[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
@@ -444,7 +443,6 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
; CHECK-LABEL: check_i32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d17, r2, r3
-; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.32 d16, d17[1]
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: mov pc, lr
@@ -457,7 +455,6 @@ define <2 x i32> @check_i32(<4 x i32> %v) nounwind {
define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
; CHECK-LABEL: check_i16:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.16 d16, d16[3]
; CHECK-NEXT: vmov r0, r1, d16
@@ -471,7 +468,6 @@ define <4 x i16> @check_i16(<8 x i16> %v) nounwind {
define <8 x i8> @check_i8(<16 x i8> %v) nounwind {
; CHECK-LABEL: check_i8:
; CHECK: @ %bb.0:
-; CHECK-NEXT: vmov d17, r2, r3
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: vdup.8 d16, d16[3]
; CHECK-NEXT: vmov r0, r1, d16
diff --git a/llvm/test/CodeGen/PowerPC/pr39478.ll b/llvm/test/CodeGen/PowerPC/pr39478.ll
index a41e1009d63..0159ecb8707 100644
--- a/llvm/test/CodeGen/PowerPC/pr39478.ll
+++ b/llvm/test/CodeGen/PowerPC/pr39478.ll
@@ -5,13 +5,13 @@
define void @pr39478(i64* %p64, i32* %p32) {
; CHECKLE-LABEL: pr39478:
; CHECKLE: # %bb.0: # %entry
-; CHECKLE-NEXT: lwz 3, 4(3)
+; CHECKLE-NEXT: lbz 3, 4(3)
; CHECKLE-NEXT: stb 3, 0(4)
; CHECKLE-NEXT: blr
;
; CHECKBE-LABEL: pr39478:
; CHECKBE: # %bb.0: # %entry
-; CHECKBE-NEXT: lwz 3, 0(3)
+; CHECKBE-NEXT: lbz 3, 3(3)
; CHECKBE-NEXT: stb 3, 3(4)
; CHECKBE-NEXT: blr
entry:
diff --git a/llvm/test/CodeGen/PowerPC/testComparesigesll.ll b/llvm/test/CodeGen/PowerPC/testComparesigesll.ll
index 2edbdc5bb80..0f0d792edcf 100644
--- a/llvm/test/CodeGen/PowerPC/testComparesigesll.ll
+++ b/llvm/test/CodeGen/PowerPC/testComparesigesll.ll
@@ -99,14 +99,14 @@ define signext i32 @test_igesll_sext_z(i64 %a) {
; CHECK-NEXT: blr
; CHECK-BE-LABEL: test_igesll_sext_z:
; CHECK-BE: # %bb.0: # %entry
-; CHECK-BE-NEXT: sradi r3, r3, 63
; CHECK-BE-NEXT: not r3, r3
+; CHECK-BE-NEXT: sradi r3, r3, 63
; CHECK-BE-NEXT: blr
;
; CHECK-LE-LABEL: test_igesll_sext_z:
; CHECK-LE: # %bb.0: # %entry
-; CHECK-LE-NEXT: sradi r3, r3, 63
; CHECK-LE-NEXT: not r3, r3
+; CHECK-LE-NEXT: sradi r3, r3, 63
; CHECK-LE-NEXT: blr
entry:
%cmp = icmp sgt i64 %a, -1
diff --git a/llvm/test/CodeGen/X86/constant-combines.ll b/llvm/test/CodeGen/X86/constant-combines.ll
index 20fbedb1574..45bc635bb67 100644
--- a/llvm/test/CodeGen/X86/constant-combines.ll
+++ b/llvm/test/CodeGen/X86/constant-combines.ll
@@ -4,42 +4,6 @@
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
-define void @PR22524({ float, float }* %arg) {
-; Check that we can materialize the zero constants we store in two places here,
-; and at least form a legal store of the floating point value at the end.
-; The DAG combiner at one point contained bugs that given enough permutations
-; would incorrectly form an illegal operation for the last of these stores when
-; it folded it to a zero too late to legalize the zero store operation. If this
-; ever starts forming a zero store instead of movss, the test case has stopped
-; being useful.
-;
-; CHECK-LABEL: PR22524:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: movd %eax, %xmm0
-; CHECK-NEXT: xorps %xmm1, %xmm1
-; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
-; CHECK-NEXT: movss %xmm1, 4(%rdi)
-; CHECK-NEXT: retq
-entry:
- %0 = getelementptr inbounds { float, float }, { float, float }* %arg, i32 0, i32 1
- store float 0.000000e+00, float* %0, align 4
- %1 = getelementptr inbounds { float, float }, { float, float }* %arg, i64 0, i32 0
- %2 = bitcast float* %1 to i64*
- %3 = load i64, i64* %2, align 8
- %4 = trunc i64 %3 to i32
- %5 = lshr i64 %3, 32
- %6 = trunc i64 %5 to i32
- %7 = bitcast i32 %6 to float
- %8 = fmul float %7, 0.000000e+00
- %9 = bitcast float* %1 to i32*
- store i32 %6, i32* %9, align 4
- store float %8, float* %0, align 4
- ret void
-}
-
-
define void @bitstore_fold() {
; CHECK-LABEL: bitstore_fold:
; CHECK: # %bb.0: # %BB
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 20f6c45ef8f..028f9a61fac 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -159,10 +159,8 @@ define void @extsetcc(<4 x float> %x) {
; CHECK-LABEL: extsetcc:
; CHECK: # %bb.0:
; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT: vcmpnleps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: vextractps $0, %xmm0, %eax
-; CHECK-NEXT: andl $1, %eax
-; CHECK-NEXT: movb %al, (%rax)
+; CHECK-NEXT: vucomiss %xmm1, %xmm0
+; CHECK-NEXT: setb (%rax)
; CHECK-NEXT: retq
%cmp = fcmp ult <4 x float> %x, zeroinitializer
%sext = sext <4 x i1> %cmp to <4 x i32>
diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
index 5ca3b2d088d..5243546697c 100644
--- a/llvm/test/CodeGen/X86/jump_sign.ll
+++ b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -238,8 +238,8 @@ define void @func_o() nounwind uwtable {
; CHECK-NEXT: movzwl (%eax), %eax
; CHECK-NEXT: movzwl %ax, %eax
; CHECK-NEXT: imull $52429, %eax, %ecx # imm = 0xCCCD
-; CHECK-NEXT: shrl $19, %ecx
-; CHECK-NEXT: addl %ecx, %ecx
+; CHECK-NEXT: shrl $18, %ecx
+; CHECK-NEXT: andl $-2, %ecx
; CHECK-NEXT: leal (%ecx,%ecx,4), %ecx
; CHECK-NEXT: cmpw %cx, %ax
; CHECK-NEXT: jne .LBB12_5
diff --git a/llvm/test/CodeGen/X86/legalize-shift-64.ll b/llvm/test/CodeGen/X86/legalize-shift-64.ll
index 8e549feb208..0147de78a8a 100644
--- a/llvm/test/CodeGen/X86/legalize-shift-64.ll
+++ b/llvm/test/CodeGen/X86/legalize-shift-64.ll
@@ -143,9 +143,9 @@ define i32 @test6() {
; CHECK-NEXT: subl $16, %esp
; CHECK-NEXT: movl $1, {{[0-9]+}}(%esp)
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT: xorl %eax, %eax
-; CHECK-NEXT: orl $0, %eax
-; CHECK-NEXT: je .LBB5_3
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: testb %al, %al
+; CHECK-NEXT: jne .LBB5_3
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: jmp .LBB5_2
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 9406125d70d..8ee23d6feff 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -2645,9 +2645,9 @@ define <8 x float> @sext_v8i8_index(float* %base, <8 x i8> %ind) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpslld $24, %ymm0, %ymm0
; SKX_32-NEXT: vpsrad $24, %ymm0, %ymm1
-; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1}
; SKX_32-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/movmsk.ll b/llvm/test/CodeGen/X86/movmsk.ll
index eebcf181196..41d054b0124 100644
--- a/llvm/test/CodeGen/X86/movmsk.ll
+++ b/llvm/test/CodeGen/X86/movmsk.ll
@@ -95,14 +95,11 @@ entry:
}
; PR11570
-; FIXME: This should also use movmskps; we don't form the FGETSIGN node
-; in this case, though.
define void @float_call_signbit(double %n) {
; CHECK-LABEL: float_call_signbit:
; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movq %xmm0, %rdi
-; CHECK-NEXT: shrq $63, %rdi
-; CHECK-NEXT: ## kill: def $edi killed $edi killed $rdi
+; CHECK-NEXT: movmskpd %xmm0, %edi
+; CHECK-NEXT: andl $1, %edi
; CHECK-NEXT: jmp _float_call_signbit_callee ## TAILCALL
entry:
%t0 = bitcast double %n to i64
diff --git a/llvm/test/CodeGen/X86/not-and-simplify.ll b/llvm/test/CodeGen/X86/not-and-simplify.ll
index 0e36a75441c..993835c66ca 100644
--- a/llvm/test/CodeGen/X86/not-and-simplify.ll
+++ b/llvm/test/CodeGen/X86/not-and-simplify.ll
@@ -20,8 +20,9 @@ define i32 @shrink_xor_constant1(i32 %x) {
define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) {
; ALL-LABEL: shrink_xor_constant1_splat:
; ALL: # %bb.0:
+; ALL-NEXT: pcmpeqd %xmm1, %xmm1
+; ALL-NEXT: pxor %xmm1, %xmm0
; ALL-NEXT: psrld $31, %xmm0
-; ALL-NEXT: pxor {{.*}}(%rip), %xmm0
; ALL-NEXT: retq
%sh = lshr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
%not = xor <4 x i32> %sh, <i32 -1, i32 -1, i32 -1, i32 -1>
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 50f7a069f02..78f6611ed46 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -1771,19 +1771,11 @@ define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
define void @PR41097() {
; SSE2-LABEL: PR41097:
; SSE2: # %bb.0:
-; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: movzwl (%rax), %eax
-; SSE2-NEXT: movd %eax, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,3,4,5,6,7]
; SSE2-NEXT: psrad $24, %xmm0
-; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT: movdqu %xmm0, (%rax)
; SSE2-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr28504.ll b/llvm/test/CodeGen/X86/pr28504.ll
deleted file mode 100644
index a617c8aa4f1..00000000000
--- a/llvm/test/CodeGen/X86/pr28504.ll
+++ /dev/null
@@ -1,37 +0,0 @@
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
-
-; The test case is rather involved, because we need to get to a state where
-; We have a sext(setcc x, y, cc) -> (select (setcc x, y, cc), T, 0) combine,
-; BUT this combine is only triggered post-legalization, so the setcc's return
-; type is i8. So we can't have the combine opportunity be exposed too early.
-; Basically, what we want to see is that the compare result zero-extended, and
-; then stored. Only one zext, and no sexts.
-
-; CHECK-LABEL: main:
-; CHECK: movzbl (%rdi), %[[EAX:.*]]
-; CHECK-NEXT: xorl %e[[C:.]]x, %e[[C]]x
-; CHECK-NEXT: cmpl $1, %[[EAX]]
-; CHECK-NEXT: sete %[[C]]l
-; CHECK-NEXT: movl %e[[C]]x, (%rsi)
-define void @main(i8* %p, i32* %q) {
-bb:
- %tmp4 = load i8, i8* %p, align 1
- %tmp5 = sext i8 %tmp4 to i32
- %tmp6 = load i8, i8* %p, align 1
- %tmp7 = zext i8 %tmp6 to i32
- %tmp8 = sub nsw i32 %tmp5, %tmp7
- %tmp11 = icmp eq i32 %tmp7, 1
- %tmp12 = zext i1 %tmp11 to i32
- %tmp13 = add nsw i32 %tmp8, %tmp12
- %tmp14 = trunc i32 %tmp13 to i8
- %tmp15 = sext i8 %tmp14 to i16
- %tmp16 = sext i16 %tmp15 to i32
- store i32 %tmp16, i32* %q, align 4
- br i1 %tmp11, label %bb21, label %bb22
-
-bb21: ; preds = %bb
- unreachable
-
-bb22: ; preds = %bb
- ret void
-}
diff --git a/llvm/test/CodeGen/X86/pr33844.ll b/llvm/test/CodeGen/X86/pr33844.ll
deleted file mode 100644
index d933d829220..00000000000
--- a/llvm/test/CodeGen/X86/pr33844.ll
+++ /dev/null
@@ -1,38 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -o - %s | FileCheck %s
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@global = external global i32
-@global.1 = external global i64
-
-define void @patatino() {
-; CHECK-LABEL: patatino:
-; CHECK: # %bb.0: # %bb
-; CHECK-NEXT: movl {{.*}}(%rip), %eax
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: shrl $31, %ecx
-; CHECK-NEXT: addl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-NEXT: shrl $31, %ecx
-; CHECK-NEXT: andl $-2, %ecx
-; CHECK-NEXT: andl $-536870912, %eax # imm = 0xE0000000
-; CHECK-NEXT: orl %ecx, %eax
-; CHECK-NEXT: movl %eax, {{.*}}(%rip)
-; CHECK-NEXT: retq
-bb:
- %tmp = load i32, i32* @global
- %tmp1 = lshr i32 %tmp, 31
- %tmp2 = add nuw nsw i32 %tmp1, 2147483647
- %tmp3 = load i64, i64* @global.1
- %tmp4 = shl i64 %tmp3, 23
- %tmp5 = add nsw i64 %tmp4, 8388639
- %tmp6 = trunc i64 %tmp5 to i32
- %tmp7 = lshr i32 %tmp2, %tmp6
- %tmp8 = load i32, i32* @global
- %tmp9 = and i32 %tmp7, 62
- %tmp10 = and i32 %tmp8, -536870912
- %tmp11 = or i32 %tmp9, %tmp10
- store i32 %tmp11, i32* @global
- ret void
-}
diff --git a/llvm/test/CodeGen/X86/sse3.ll b/llvm/test/CodeGen/X86/sse3.ll
index 9ad4c65f7d1..b9da731837f 100644
--- a/llvm/test/CodeGen/X86/sse3.ll
+++ b/llvm/test/CodeGen/X86/sse3.ll
@@ -394,14 +394,14 @@ entry:
define <4 x i32> @t17() nounwind {
; X86-LABEL: t17:
; X86: # %bb.0: # %entry
-; X86-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; X86-NEXT: andpd {{\.LCPI.*}}, %xmm0
+; X86-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; X86-NEXT: pand {{\.LCPI.*}}, %xmm0
; X86-NEXT: retl
;
; X64-LABEL: t17:
; X64: # %bb.0: # %entry
-; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0]
-; X64-NEXT: andpd {{.*}}(%rip), %xmm0
+; X64-NEXT: pshufd {{.*#+}} xmm0 = mem[0,1,0,1]
+; X64-NEXT: pand {{.*}}(%rip), %xmm0
; X64-NEXT: retq
entry:
%tmp1 = load <4 x float>, <4 x float>* undef, align 16
diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
index d8502d831fd..5d68857f6e4 100644
--- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll
@@ -5,20 +5,10 @@
define i32 @test0(<1 x i64>* %v4) nounwind {
; X32-LABEL: test0:
; X32: # %bb.0: # %entry
-; X32-NEXT: pushl %ebp
-; X32-NEXT: movl %esp, %ebp
-; X32-NEXT: andl $-8, %esp
-; X32-NEXT: subl $8, %esp
-; X32-NEXT: movl 8(%ebp), %eax
-; X32-NEXT: movl (%eax), %ecx
-; X32-NEXT: movl 4(%eax), %eax
-; X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X32-NEXT: movl %ecx, (%esp)
-; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: pshufw $238, (%eax), %mm0 # mm0 = mem[2,3,2,3]
; X32-NEXT: movd %mm0, %eax
; X32-NEXT: addl $32, %eax
-; X32-NEXT: movl %ebp, %esp
-; X32-NEXT: popl %ebp
; X32-NEXT: retl
;
; X64-LABEL: test0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index becc195e393..6bcb20e85a5 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -150,25 +150,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,8,9]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
@@ -190,25 +180,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,5,6]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,5,6]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,10,11,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
@@ -230,25 +210,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,12,13,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
@@ -270,25 +240,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
; AVX512VL-SLOW-NEXT: retq
;
@@ -869,25 +829,15 @@ define <16 x i16> @shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_04_00_00_00_00_00_00_00_04_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
@@ -907,25 +857,15 @@ define <16 x i16> @shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_05_00_00_00_00_00_00_00_05_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
@@ -945,25 +885,15 @@ define <16 x i16> @shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_06_00_00_00_00_00_00_00_06_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
@@ -983,25 +913,15 @@ define <16 x i16> @shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,0,0,0,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; AVX2-NEXT: retq
;
; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00:
; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,0,3]
-; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,0,0,0,4,5,6,7]
-; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4]
+; AVX512VL-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
; AVX512VL-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
; AVX512VL-SLOW-NEXT: retq
;
@@ -4661,31 +4581,17 @@ define <16 x i16> @PR34369(<16 x i16> %vec, <16 x i16> %mask) {
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
-; AVX2-SLOW-LABEL: PR34369:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,1]
-; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7]
-; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,6]
-; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: PR34369:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
-; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
-; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vpand %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: retq
+; AVX2-LABEL: PR34369:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,10,11,4,5,10,11,8,9,10,11,4,5,4,5]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,0,1,0,1,u,u,10,11,4,5,4,5,u,u,30,31,16,17,28,29,16,17,18,19,20,21,24,25,24,25]
+; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3],ymm0[4,5,6],ymm2[7],ymm0[8,9,10],ymm2[11],ymm0[12,13,14],ymm2[15]
+; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT: vpcmpeqw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: retq
;
; AVX512VL-LABEL: PR34369:
; AVX512VL: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/widen_arith-3.ll b/llvm/test/CodeGen/X86/widen_arith-3.ll
index 3e455f7f14c..f3b78534b20 100644
--- a/llvm/test/CodeGen/X86/widen_arith-3.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-3.ll
@@ -13,12 +13,11 @@ define void @update(<3 x i16>* %dst, <3 x i16>* %src, i32 %n) nounwind {
; CHECK-NEXT: movl %esp, %ebp
; CHECK-NEXT: andl $-8, %esp
; CHECK-NEXT: subl $32, %esp
-; CHECK-NEXT: movl {{\.LCPI.*}}, %eax
; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; CHECK-NEXT: pcmpeqd %xmm0, %xmm0
; CHECK-NEXT: movw $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT: movl $65537, {{[0-9]+}}(%esp) # imm = 0x10001
; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp)
-; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp)
; CHECK-NEXT: jmp .LBB0_1
; CHECK-NEXT: .p2align 4, 0x90
; CHECK-NEXT: .LBB0_2: # %forbody
OpenPOWER on IntegriCloud