summary refs log tree commit diff stats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInsertWaits.cpp 10
-rw-r--r-- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 19
-rw-r--r-- llvm/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir 89
-rw-r--r-- llvm/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir 177
4 files changed, 288 insertions, 7 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
index 6c4a2a4d210..a9e693917bf 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -178,8 +178,10 @@ FunctionPass *llvm::createSIInsertWaitsPass() {
const Counters SIInsertWaits::ZeroCounts = { { 0, 0, 0 } };
-static bool readsVCCZ(unsigned Opcode) {
- return Opcode == AMDGPU::S_CBRANCH_VCCNZ || Opcode == AMDGPU::S_CBRANCH_VCCZ;
+static bool readsVCCZ(const MachineInstr &MI) {
+ unsigned Opc = MI.getOpcode();
+ return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+ !MI.getOperand(1).isUndef();
}
bool SIInsertWaits::hasOutstandingLGKM() const {
@@ -574,7 +576,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
}
// Check if we need to apply the bug work-around
- if (readsVCCZ(I->getOpcode()) && VCCZCorrupt) {
+ if (VCCZCorrupt && readsVCCZ(*I)) {
DEBUG(dbgs() << "Inserting vccz bug work-around before: " << *I << '\n');
// Wait on everything, not just LGKM. vccz reads usually come from
@@ -589,7 +591,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
// vcc and then writing it back to the register.
BuildMI(MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
+ .addReg(AMDGPU::VCC);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 94b484ed0b3..02cbc882bf8 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1196,6 +1196,7 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB,
MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
Cond.push_back(MachineOperand::CreateImm(Pred));
+ Cond.push_back(I->getOperand(1)); // Save the branch register.
++I;
@@ -1298,9 +1299,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
= getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
if (!FBB) {
- BuildMI(&MBB, DL, get(Opcode))
+ Cond[1].isUndef();
+ MachineInstr *CondBr =
+ BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
+ // Copy the flags onto the implicit condition register operand.
+ MachineOperand &CondReg = CondBr->getOperand(1);
+ CondReg.setIsUndef(Cond[1].isUndef());
+ CondReg.setIsKill(Cond[1].isKill());
+
if (BytesAdded)
*BytesAdded = 4;
return 1;
@@ -1308,11 +1316,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(TBB && FBB);
- BuildMI(&MBB, DL, get(Opcode))
+ MachineInstr *CondBr =
+ BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
.addMBB(FBB);
+ MachineOperand &CondReg = CondBr->getOperand(1);
+ CondReg.setIsUndef(Cond[1].isUndef());
+ CondReg.setIsKill(Cond[1].isKill());
+
if (BytesAdded)
*BytesAdded = 8;
@@ -1321,7 +1334,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
bool SIInstrInfo::reverseBranchCondition(
SmallVectorImpl<MachineOperand> &Cond) const {
- assert(Cond.size() == 1);
+ assert(Cond.size() == 2);
Cond[0].setImm(-Cond[0].getImm());
return false;
}
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir b/llvm/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir
new file mode 100644
index 00000000000..66182d09289
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/invert-br-undef-vcc.mir
@@ -0,0 +1,89 @@
+# RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
+--- |
+
+ define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+ entry:
+ br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+ else: ; preds = %entry
+ store volatile i32 100, i32 addrspace(1)* undef
+ br label %done, !structurizecfg.uniform !0
+
+ if: ; preds = %entry
+ store volatile i32 9, i32 addrspace(1)* undef
+ br label %done, !structurizecfg.uniform !0
+
+ done: ; preds = %if, %else
+ %value = phi i32 [ 0, %if ], [ 1, %else ]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ attributes #0 = { nounwind }
+
+ !0 = !{}
+
+...
+---
+# CHECK-LABEL: name: invert_br_undef_vcc
+# CHECK: S_CBRANCH_VCCZ %bb.1.else, implicit undef %vcc
+
+name: invert_br_undef_vcc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ successors: %bb.2.if, %bb.1.else
+ liveins: %sgpr0_sgpr1
+
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ S_CBRANCH_VCCNZ %bb.2.if, implicit undef %vcc
+
+ bb.1.else:
+ successors: %bb.3.done
+ liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+ %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+ S_BRANCH %bb.3.done
+
+ bb.2.if:
+ successors: %bb.3.done
+ liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+ %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+
+ bb.3.done:
+ liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %sgpr3 = S_MOV_B32 61440
+ %sgpr2 = S_MOV_B32 -1
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir b/llvm/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir
new file mode 100644
index 00000000000..03e473e3a0c
--- /dev/null
+++ b/llvm/test/CodeGen/MIR/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -0,0 +1,177 @@
+# RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
+--- |
+
+ define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+ entry:
+ %cmp0 = fcmp oeq float %cond, 0.000000e+00
+ br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+ else: ; preds = %entry
+ store volatile i32 100, i32 addrspace(1)* undef
+ br label %done, !structurizecfg.uniform !0
+
+ if: ; preds = %entry
+ store volatile i32 9, i32 addrspace(1)* undef
+ br label %done, !structurizecfg.uniform !0
+
+ done: ; preds = %if, %else
+ %value = phi i32 [ 0, %if ], [ 1, %else ]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+ entry:
+ br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
+
+ else: ; preds = %entry
+ store volatile i32 100, i32 addrspace(1)* undef
+ br label %done, !structurizecfg.uniform !0
+
+ if: ; preds = %entry
+ store volatile i32 9, i32 addrspace(1)* undef
+ br label %done, !structurizecfg.uniform !0
+
+ done: ; preds = %if, %else
+ %value = phi i32 [ 0, %if ], [ 1, %else ]
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+ }
+
+ attributes #0 = { nounwind }
+ attributes #1 = { readnone }
+
+ !0 = !{}
+
+...
+---
+# CHECK-LABEL: name: vccz_corrupt_workaround
+# CHECK: %vcc = V_CMP_EQ_F32
+# CHECK-NEXT: %vcc = S_MOV_B64 %vcc
+# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2.else, implicit killed %vcc
+
+name: vccz_corrupt_workaround
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ successors: %bb.2.if, %bb.1.else
+ liveins: %sgpr0_sgpr1
+
+ %sgpr2 = S_LOAD_DWORD_IMM %sgpr0_sgpr1, 9, 0 :: (non-temporal dereferenceable invariant load 4 from `float addrspace(2)* undef`)
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ %vcc = V_CMP_EQ_F32_e64 0, 0, 0, %sgpr2, 0, 0, implicit %exec
+ S_CBRANCH_VCCZ %bb.1.else, implicit killed %vcc
+
+ bb.2.if:
+ successors: %bb.3.done
+ liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+ %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+ S_BRANCH %bb.3.done
+
+ bb.1.else:
+ successors: %bb.3.done
+ liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+ %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+
+ bb.3.done:
+ liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %sgpr3 = S_MOV_B32 61440
+ %sgpr2 = S_MOV_B32 -1
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...
+---
+# CHECK-LABEL: name: vccz_corrupt_undef_vcc
+# CHECK: S_WAITCNT
+# CHECK-NEXT: S_CBRANCH_VCCZ %bb.2.else, implicit undef %vcc
+
+name: vccz_corrupt_undef_vcc
+alignment: 0
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+tracksRegLiveness: true
+liveins:
+ - { reg: '%sgpr0_sgpr1' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 0
+ adjustsStack: false
+ hasCalls: false
+ maxCallFrameSize: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+body: |
+ bb.0.entry:
+ successors: %bb.2.if, %bb.1.else
+ liveins: %sgpr0_sgpr1
+
+ %sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM killed %sgpr0_sgpr1, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+ %sgpr7 = S_MOV_B32 61440
+ %sgpr6 = S_MOV_B32 -1
+ S_CBRANCH_VCCZ %bb.1.else, implicit undef %vcc
+
+ bb.2.if:
+ successors: %bb.3.done
+ liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %vgpr0 = V_MOV_B32_e32 9, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+ %vgpr0 = V_MOV_B32_e32 0, implicit %exec
+ S_BRANCH %bb.3.done
+
+ bb.1.else:
+ successors: %bb.3.done
+ liveins: %sgpr6, %sgpr7, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %vgpr0 = V_MOV_B32_e32 100, implicit %exec
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+ %vgpr0 = V_MOV_B32_e32 1, implicit %exec
+
+ bb.3.done:
+ liveins: %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3:0x00000003
+
+ %sgpr3 = S_MOV_B32 61440
+ %sgpr2 = S_MOV_B32 -1
+ BUFFER_STORE_DWORD_OFFSET killed %vgpr0, killed %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+ S_ENDPGM
+
+...
OpenPOWER on IntegriCloud