summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2016-03-23 21:49:25 +0000
committerMatt Arsenault <Matthew.Arsenault@amd.com>2016-03-23 21:49:25 +0000
commitf43c2a0b4967626c7255940a7317ec4c8c9e9319 (patch)
treef27a5a2c8fa22e8ff0e614efbe33c98c748f3c26
parent94710840fb2e1a16a75124593314b60bf13a0a3a (diff)
downloadbcm5719-llvm-f43c2a0b4967626c7255940a7317ec4c8c9e9319.tar.gz
bcm5719-llvm-f43c2a0b4967626c7255940a7317ec4c8c9e9319.zip
AMDGPU: Insert moves of frame index to value operands
Strengthen tests of storing frame indices. Right now this just creates irrelevant scheduling changes. We don't want to have multiple frame index operands on an instruction. There seem to be various assumptions that at least the same frame index will not appear twice in the LocalStackSlotAllocation pass. There's no reason to have this happen, and it just makes it easy to introduce bugs where the immediate offset is applied to the storing instruction when it should really be applied to the value being stored as a separate add. This might not be sufficient. It might still be problematic to have an add fi, fi situation, but that's even less likely to happen in real code. llvm-svn: 264200
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp56
-rw-r--r--llvm/test/CodeGen/AMDGPU/captured-frame-index.ll119
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll35
3 files changed, 204 insertions, 6 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 1f442bc0779..156423500af 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -19,6 +19,7 @@
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
@@ -1557,6 +1558,61 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
void AMDGPUDAGToDAGISel::PreprocessISelDAG() {
bool Modified = false;
+ MachineFrameInfo *MFI = CurDAG->getMachineFunction().getFrameInfo();
+
+ // Handle the perverse case where a frame index is being stored. We don't
+ // want to see multiple frame index operands on the same instruction since
+ // it complicates things and violates some assumptions about frame index
+ // lowering.
+ for (int I = MFI->getObjectIndexBegin(), E = MFI->getObjectIndexEnd();
+ I != E; ++I) {
+ SDValue FI = CurDAG->getTargetFrameIndex(I, MVT::i32);
+
+ // It's possible that we have a frame index defined in the function that
+ // isn't used in this block.
+ if (FI.use_empty())
+ continue;
+
+ // Skip over the AssertZext inserted during lowering.
+ SDValue EffectiveFI = FI;
+ auto It = FI->use_begin();
+ if (It->getOpcode() == ISD::AssertZext && FI->hasOneUse()) {
+ EffectiveFI = SDValue(*It, 0);
+ It = EffectiveFI->use_begin();
+ }
+
+ for (auto It = EffectiveFI->use_begin(); !It.atEnd(); ) {
+ SDUse &Use = It.getUse();
+ SDNode *User = Use.getUser();
+ unsigned OpIdx = It.getOperandNo();
+ ++It;
+
+ if (MemSDNode *M = dyn_cast<MemSDNode>(User)) {
+ unsigned PtrIdx = M->getOpcode() == ISD::STORE ? 2 : 1;
+ if (OpIdx == PtrIdx)
+ continue;
+
+ unsigned OpN = OpN = M->getNumOperands();
+ SDValue NewOps[8];
+
+ assert(OpN < array_lengthof(NewOps));
+ for (unsigned Op = 0; Op != OpN; ++Op) {
+ if (Op != OpIdx) {
+ NewOps[Op] = M->getOperand(Op);
+ continue;
+ }
+
+ MachineSDNode *Mov = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
+ SDLoc(M), MVT::i32, FI);
+ NewOps[Op] = SDValue(Mov, 0);
+ }
+
+ CurDAG->UpdateNodeOperands(M, makeArrayRef(NewOps, OpN));
+ Modified = true;
+ }
+ }
+ }
+
// XXX - Other targets seem to be able to do this without a worklist.
SmallVector<LoadSDNode *, 8> LoadsToReplace;
SmallVector<StoreSDNode *, 8> StoresToReplace;
diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
new file mode 100644
index 00000000000..6b70c931feb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -0,0 +1,119 @@
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}stored_fi_to_lds:
+; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+
+; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+
+; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
+define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+ %tmp = alloca float
+ store float 4.0, float *%tmp
+ store float* %tmp, float* addrspace(3)* %ptr
+ ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
+; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[ZERO1:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[ZERO1]]
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, [[FI1]]
+
+
+; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
+; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
+define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ store float 4.0, float *%tmp0
+ store float 4.0, float *%tmp1
+ store volatile float* %tmp0, float* addrspace(3)* %ptr
+ store volatile float* %tmp1, float* addrspace(3)* %ptr
+ ret void
+}
+
+; Same frame index is used multiple times in the store
+; GCN-LABEL: {{^}}stored_fi_to_self:
+define void @stored_fi_to_self() #0 {
+ %tmp = alloca i32*
+
+ ; Avoid optimizing everything out
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+ %bitcast = bitcast i32** %tmp to i32*
+ store volatile i32* %bitcast, i32** %tmp
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_fi:
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+; GCN: buffer_store_dword
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI1]], [[FI2]]
+
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], [[FI1]]
+define void @stored_fi_to_fi() #0 {
+ %tmp0 = alloca i32*
+ %tmp1 = alloca i32*
+ %tmp2 = alloca i32*
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0
+ store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1
+ store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2
+
+ %bitcast1 = bitcast i32** %tmp1 to i32*
+ %bitcast2 = bitcast i32** %tmp2 to i32* ; at offset 8
+
+ store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8
+ store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4
+ ret void
+}
+
+; GCN-LABEL: {{^}}stored_fi_to_global:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[FI]]
+define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+ %tmp = alloca float
+ store float 0.0, float *%tmp
+ store float* %tmp, float* addrspace(1)* %ptr
+ ret void
+}
+
+; Offset is applied
+; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
+
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[FI1]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI2]], s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
+define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ %tmp2 = alloca float
+ store volatile float 0.0, float *%tmp0
+ store volatile float 0.0, float *%tmp1
+ store volatile float 0.0, float *%tmp2
+ store volatile float* %tmp1, float* addrspace(1)* %ptr
+ store volatile float* %tmp2, float* addrspace(1)* %ptr
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index 2ee98cc3d2d..307eca712cc 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Pointer value is stored in a candidate for LDS usage.
@@ -11,6 +12,18 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
ret void
}
+; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
+; GCN: buffer_store_dword v
+define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+ %tmp0 = alloca float
+ %tmp1 = alloca float
+ store float 0.0, float *%tmp0
+ store float 0.0, float *%tmp1
+ store volatile float* %tmp0, float* addrspace(1)* %ptr
+ store volatile float* %tmp1, float* addrspace(1)* %ptr
+ ret void
+}
+
; GCN-LABEL: {{^}}stored_lds_pointer_value_gep:
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
@@ -36,17 +49,27 @@ bb:
define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
entry:
%tmp0 = alloca [4 x i32]
- %x = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
- %y = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
- %z = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
- %w = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
+ %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 1
+ %z = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 2
+ %w = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 3
store i32 0, i32* %x
store i32 1, i32* %y
store i32 2, i32* %z
store i32 3, i32* %w
- %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
+ %tmp1 = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 %index
store i32* %tmp1, i32* addrspace(1)* %out
ret void
}
+; GCN-LABEL: {{^}}stored_fi_to_self:
+; GCN-NOT: ds_
+define void @stored_fi_to_self() #0 {
+ %tmp = alloca i32*
+ store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
+ %bitcast = bitcast i32** %tmp to i32*
+ store volatile i32* %bitcast, i32** %tmp
+ ret void
+}
+
attributes #0 = { nounwind }
OpenPOWER on IntegriCloud