summaryrefslogtreecommitdiffstats
path: root/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
diff options
context:
space:
mode:
authorJay Foad <jay.foad@gmail.com>2019-07-05 14:52:48 +0000
committerJay Foad <jay.foad@gmail.com>2019-07-05 14:52:48 +0000
commit7e0c10b55ff781906b1ac81254ce705b00264aee (patch)
tree4151369df21c9057aacd4bf7a20a7ad7aeda7af5 /llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
parentd14003d99f3d59b62bb6ff5a1a5d691f47d2b824 (diff)
downloadbcm5719-llvm-7e0c10b55ff781906b1ac81254ce705b00264aee.tar.gz
bcm5719-llvm-7e0c10b55ff781906b1ac81254ce705b00264aee.zip
[AMDGPU] DPP combiner: recognize identities for more opcodes
Summary: This allows the DPP combiner to kick in more often. For example the exclusive scan generated by the atomic optimizer for a divergent atomic add used to look like this: v_mov_b32_e32 v3, v1 v_mov_b32_e32 v5, v1 v_mov_b32_e32 v6, v1 v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf s_nop 1 v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf v_mov_b32_dpp v6, v3 row_shr:3 row_mask:0xf bank_mask:0xf v_add3_u32 v3, v4, v5, v6 v_mov_b32_e32 v4, v1 s_nop 1 v_mov_b32_dpp v4, v3 row_shr:4 row_mask:0xf bank_mask:0xe v_add_u32_e32 v3, v3, v4 v_mov_b32_e32 v4, v1 s_nop 1 v_mov_b32_dpp v4, v3 row_shr:8 row_mask:0xf bank_mask:0xc v_add_u32_e32 v3, v3, v4 v_mov_b32_e32 v4, v1 s_nop 1 v_mov_b32_dpp v4, v3 row_bcast:15 row_mask:0xa bank_mask:0xf v_add_u32_e32 v3, v3, v4 s_nop 1 v_mov_b32_dpp v1, v3 row_bcast:31 row_mask:0xc bank_mask:0xf v_add_u32_e32 v1, v3, v1 v_add_u32_e32 v1, v2, v1 v_readlane_b32 s0, v1, 63 But now most of the dpp movs are combined into adds: v_mov_b32_e32 v3, v1 v_mov_b32_e32 v5, v1 s_nop 0 v_mov_b32_dpp v3, v2 wave_shr:1 row_mask:0xf bank_mask:0xf s_nop 1 v_add_u32_dpp v4, v3, v3 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf v_mov_b32_dpp v1, v3 row_shr:3 row_mask:0xf bank_mask:0xf v_add3_u32 v1, v4, v5, v1 s_nop 1 v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xe s_nop 1 v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xc s_nop 1 v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf s_nop 1 v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf v_add_u32_e32 v1, v2, v1 v_readlane_b32 s0, v1, 63 Reviewers: arsenm, vpykhtin Subscribers: kzhuravl, nemanjai, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, kbarton, MaskRay, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D64207 llvm-svn: 365211
Diffstat (limited to 'llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp13
1 file changed, 13 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 536dc54a65c..7348b5b56c8 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -253,33 +253,46 @@ static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
switch (OrigMIOp) {
default: break;
case AMDGPU::V_ADD_U32_e32:
+ case AMDGPU::V_ADD_U32_e64:
case AMDGPU::V_ADD_I32_e32:
+ case AMDGPU::V_ADD_I32_e64:
case AMDGPU::V_OR_B32_e32:
+ case AMDGPU::V_OR_B32_e64:
case AMDGPU::V_SUBREV_U32_e32:
+ case AMDGPU::V_SUBREV_U32_e64:
case AMDGPU::V_SUBREV_I32_e32:
+ case AMDGPU::V_SUBREV_I32_e64:
case AMDGPU::V_MAX_U32_e32:
+ case AMDGPU::V_MAX_U32_e64:
case AMDGPU::V_XOR_B32_e32:
+ case AMDGPU::V_XOR_B32_e64:
if (OldOpnd->getImm() == 0)
return true;
break;
case AMDGPU::V_AND_B32_e32:
+ case AMDGPU::V_AND_B32_e64:
case AMDGPU::V_MIN_U32_e32:
+ case AMDGPU::V_MIN_U32_e64:
if (static_cast<uint32_t>(OldOpnd->getImm()) ==
std::numeric_limits<uint32_t>::max())
return true;
break;
case AMDGPU::V_MIN_I32_e32:
+ case AMDGPU::V_MIN_I32_e64:
if (static_cast<int32_t>(OldOpnd->getImm()) ==
std::numeric_limits<int32_t>::max())
return true;
break;
case AMDGPU::V_MAX_I32_e32:
+ case AMDGPU::V_MAX_I32_e64:
if (static_cast<int32_t>(OldOpnd->getImm()) ==
std::numeric_limits<int32_t>::min())
return true;
break;
case AMDGPU::V_MUL_I32_I24_e32:
+ case AMDGPU::V_MUL_I32_I24_e64:
case AMDGPU::V_MUL_U32_U24_e32:
+ case AMDGPU::V_MUL_U32_U24_e64:
if (OldOpnd->getImm() == 1)
return true;
break;
OpenPOWER on IntegriCloud