DAG: Undo and->or combine with FrameIndexes

This pattern shows up when lowering byval copies on AMDGPU. The byval object access is split into 4-byte chunks, adding a constant offset to the FixedStack base. When some of the offsets turn into ors, this prevents combining the constant offsets. This makes it not apparent that the object is there when matching addressing modes, so it ends up using a scratch wave offset relative access and the lengthy frame index expansion for that. llvm-svn: 309775
author: Matt Arsenault <Matthew.Arsenault@amd.com> 2017-08-02 00:43:42 +0000
committer: Matt Arsenault <Matthew.Arsenault@amd.com> 2017-08-02 00:43:42 +0000
commit: acc5e82b0e33a27ba64ccb4f88c81fe8709d4576 (patch)
tree: de36eb055031183f0de7f2f65c688c27091bb987
parent: 6b898beb8e6bf5739d43c1d64646264afc595de8 (diff)
download: bcm5719-llvm-acc5e82b0e33a27ba64ccb4f88c81fe8709d4576.tar.gz
bcm5719-llvm-acc5e82b0e33a27ba64ccb4f88c81fe8709d4576.zip
4 files changed, 64 insertions, 85 deletions
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 54cddf77b4c..cf19d7b4fc0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1947,6 +1947,15 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
         return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Not);
       }
     }
+
+    // Undo the add -> or combine to merge constant offsets from a frame index.
+    if (N0.getOpcode() == ISD::OR &&
+        isa<FrameIndexSDNode>(N0.getOperand(0)) &&
+        isa<ConstantSDNode>(N0.getOperand(1)) &&
+        DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
+      SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
+      return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
+    }
   }
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
index 64a6b9b6b21..d6a1686d566 100644
--- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -101,9 +101,8 @@ define i32 @main() nounwind ssp {
 define void @foo(i8* %fmt, ...) nounwind {
 entry:
 ; CHECK-LABEL: foo:
-; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
 ; CHECK: ldr {{w[0-9]+}}, [sp, #48]
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #23
 ; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
 ; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
   %fmt.addr = alloca i8*, align 8
@@ -142,9 +141,8 @@ entry:
 define void @foo2(i8* %fmt, ...) nounwind {
 entry:
 ; CHECK-LABEL: foo2:
-; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8
 ; CHECK: ldr {{w[0-9]+}}, [sp, #48]
-; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #23
 ; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0
 ; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]]
   %fmt.addr = alloca i8*, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
index b53447c8ffb..0f348c6b4cb 100644
--- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll
@@ -74,46 +74,29 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
 ; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13
 
-; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
-; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
-
-; GCN-DAG: v_add_i32_e64 [[FI_ADD0:v[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 8,
-; GCN-DAG: v_or_b32_e32 [[FI_OR0:v[0-9]+]], 4, [[FI_ADD0]]
-
 ; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s5 offset:8
 ; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s5 offset:24
 
-; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:4
-; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], [[FI_OR0]], s[0:3], s4 offen offset:8
-
-; FIXME: or fails to combine with add, so FI doesn't fold and scratch wave offset is used
-; VI-DAG: v_lshrrev_b32_e64 v{{[0-9]+}}, 6
-; CI-DAG: v_lshr_b32_e64 v{{[0-9]+}}, s{{[0-9]+}}, 6
-
-; GCN-DAG: v_add_i32_e64 [[FI_ADD1:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 24,
-; GCN-DAG: v_or_b32_e32 [[FI_OR1:v[0-9]+]], 4, [[FI_ADD1]]
-
-; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
+; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s5 offset:8
+; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s5 offset:12
+; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:16
+; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:20
 
 
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
 
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s5 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s5 offset:12
-; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
+; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s5 offset:24
+; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s5 offset:28
+; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:32
+; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:36
 
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:4
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[FI_OR1]], s[0:3], s4 offen offset:8
-; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
-; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
-
-
-; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s5 offset:24
-; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s5 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
 
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x800{{$}}
@@ -152,36 +135,25 @@ entry:
 
 ; GCN-DAG: s_add_u32 s32, s32, 0x800{{$}}
 
-; FIXME: Fold offset
-; GCN-DAG: v_or_b32_e32 [[OR_FI0:v[0-9]+]], 4,
-
-; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:4
-; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], [[OR_FI0]], s[0:3], s33 offen offset:8
-
-; FIXME: Fold offset
-; GCN-DAG: v_or_b32_e32 [[OR_FI1:v[0-9]+]], 4,
-
-; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:12
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32 offset:8
-
-
-; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:8
-; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:12
-; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32{{$}}
-
-
-
-; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:4
-; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], [[OR_FI1]], s[0:3], s33 offen offset:8
-; GCN: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:28
-; GCN: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:24
-
-
-; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:24
-; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:28
-; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:16
-; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8
+; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12
+; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16
+; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20
+
+; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}}
+; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4
+; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8
+; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12
+
+; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24
+; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28
+; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32
+; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36
+
+; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16
+; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20
+; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24
+; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28
 
 
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/BPF/undef.ll b/llvm/test/CodeGen/BPF/undef.ll
index 205d97c80ef..fa816546421 100644
--- a/llvm/test/CodeGen/BPF/undef.ll
+++ b/llvm/test/CodeGen/BPF/undef.ll
@@ -14,30 +14,30 @@
 
 ; Function Attrs: nounwind uwtable
 define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" {
-; CHECK: r1 = r10
-; CHECK: r1 += -2
-; CHECK: r2 = 0
-; CHECK: *(u16 *)(r1 + 6) = r2
-; CHECK: *(u16 *)(r1 + 4) = r2
-; CHECK: *(u16 *)(r1 + 2) = r2
 ; EL: r1 = 134678021
 ; EB: r1 = 84281096
 ; CHECK: *(u32 *)(r10 - 8) = r1
 ; EL: r1 = 2569
 ; EB: r1 = 2314
 ; CHECK: *(u16 *)(r10 - 4) = r1
-; CHECK: *(u16 *)(r10 + 24) = r2
-; CHECK: *(u16 *)(r10 + 22) = r2
-; CHECK: *(u16 *)(r10 + 20) = r2
-; CHECK: *(u16 *)(r10 + 18) = r2
-; CHECK: *(u16 *)(r10 + 16) = r2
-; CHECK: *(u16 *)(r10 + 14) = r2
-; CHECK: *(u16 *)(r10 + 12) = r2
-; CHECK: *(u16 *)(r10 + 10) = r2
-; CHECK: *(u16 *)(r10 + 8) = r2
-; CHECK: *(u16 *)(r10 + 6) = r2
-; CHECK: *(u16 *)(r10 - 2) = r2
-; CHECK: *(u16 *)(r10 + 26) = r2
+
+; CHECK: r1 = 0
+; CHECK: *(u16 *)(r10 + 24) = r1
+; CHECK: *(u16 *)(r10 + 22) = r1
+; CHECK: *(u16 *)(r10 + 20) = r1
+; CHECK: *(u16 *)(r10 + 18) = r1
+; CHECK: *(u16 *)(r10 + 16) = r1
+; CHECK: *(u16 *)(r10 + 14) = r1
+; CHECK: *(u16 *)(r10 + 12) = r1
+; CHECK: *(u16 *)(r10 + 10) = r1
+; CHECK: *(u16 *)(r10 + 8) = r1
+; CHECK: *(u16 *)(r10 + 6) = r1
+; CHECK: *(u16 *)(r10 + 4) = r1
+; CHECK: *(u16 *)(r10 + 2) = r1
+; CHECK: *(u16 *)(r10 + 0) = r1
+; CHECK: *(u16 *)(r10 - 2) = r1
+; CHECK: *(u16 *)(r10 + 26) = r1
+
 ; CHECK: r2 = r10
 ; CHECK: r2 += -8
 ; CHECK: r1 = <MCOperand Expr:(routing)>ll
author	Matt Arsenault <Matthew.Arsenault@amd.com>	2017-08-02 00:43:42 +0000
committer	Matt Arsenault <Matthew.Arsenault@amd.com>	2017-08-02 00:43:42 +0000
commit	acc5e82b0e33a27ba64ccb4f88c81fe8709d4576 (patch)
tree	de36eb055031183f0de7f2f65c688c27091bb987
parent	6b898beb8e6bf5739d43c1d64646264afc595de8 (diff)
download	bcm5719-llvm-acc5e82b0e33a27ba64ccb4f88c81fe8709d4576.tar.gz bcm5719-llvm-acc5e82b0e33a27ba64ccb4f88c81fe8709d4576.zip