summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorMatt Arsenault <Matthew.Arsenault@amd.com>2019-11-02 09:30:59 -0700
committerMatt Arsenault <arsenm2@gmail.com>2020-01-06 18:26:42 -0500
commit4e85ca9562a588eba491e44bcbf73cb2f419780f (patch)
treede5edd8f24576b0260e48b4b5e1fea08b2d18d6a /llvm/lib
parent26f714ff43e3498ae2528ad8c9875de77a529472 (diff)
downloadbcm5719-llvm-4e85ca9562a588eba491e44bcbf73cb2f419780f.tar.gz
bcm5719-llvm-4e85ca9562a588eba491e44bcbf73cb2f419780f.zip
AMDGPU/GlobalISel: Replace handling of boolean values
This solves selection failures with generated selection patterns, which would fail due to inferring the SGPR reg bank for virtual registers with a set register class instead of VCC bank. Use instruction selection would constrain the virtual register to a specific class, so when the def was selected later the bank no longer was set to VCC. Remove the SCC reg bank. SCC isn't directly addressable, so it requires copying from SCC to an allocatable 32-bit register during selection, so these might as well be treated as 32-bit SGPR values. Now any scalar boolean value that will produce an outupt in SCC should be widened during RegBankSelect to s32. Any s1 value should be a vector boolean during selection. This makes the vcc register bank unambiguous with a normal SGPR during selection. Summary of how this should now work: - G_TRUNC is always a no-op, and never should use a vcc bank result. - SALU boolean operations should be promoted to s32 in RegBankSelect apply mapping - An s1 value means vcc bank at selection. The exception is for legalization artifacts that use s1, which are never VCC. All other contexts should infer the VCC register classes for s1 typed registers. The LLT for the register is now needed to infer the correct register class. Extensions with vcc sources should be legalized to a select of constants during RegBankSelect. - Copy from non-vcc to vcc ensures high bits of the input value are cleared during selection. - SALU boolean inputs should ensure the inputs are 0/1. This includes select, conditional branches, and carry-ins. There are a few somewhat dirty details. One is that G_TRUNC/G_*EXT selection ignores the usual register-bank from register class functions, and can't handle truncates with VCC result banks. I think this is OK, since the artifacts are specially treated anyway. This does require some care to avoid producing cases with vcc. 
There will also be no 100% reliable way to verify this rule is followed in selection in case of register classes, and violations manifest themselves as invalid copy instructions much later. Standard phi handling also only considers the bank of the result register, and doesn't insert copies to make the source banks match. This doesn't work for vcc, so we have to manually correct phi inputs in this case. We should add a verifier check to make sure there are no phis with mixed vcc and non-vcc register bank inputs. There's also some duplication with the LegalizerHelper, and some code which should live in the helper. I don't see a good way to share special knowledge about what types to use for intermediate operations depending on the bank for example. Using the helper to replace extensions with selects also seems somewhat awkward to me. Another issue is there are some contexts calling getRegBankFromRegClass that apparently don't have the LLT type for the register, but I haven't yet run into a real issue from this. This also introduces new unnecessary instructions in most cases, since we don't yet try to optimize out the zext when the source is known to come from a compare.
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def110
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp137
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp386
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td2
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp8
8 files changed, 427 insertions, 249 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index ae87cf08275..2e92ae51660 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -15,35 +15,34 @@ namespace AMDGPU {
enum PartialMappingIdx {
None = - 1,
- PM_SGPR1 = 2,
- PM_SGPR16 = 6,
- PM_SGPR32 = 7,
- PM_SGPR64 = 8,
- PM_SGPR128 = 9,
- PM_SGPR256 = 10,
- PM_SGPR512 = 11,
- PM_SGPR1024 = 12,
- PM_VGPR1 = 13,
- PM_VGPR16 = 17,
- PM_VGPR32 = 18,
- PM_VGPR64 = 19,
- PM_VGPR128 = 20,
- PM_VGPR256 = 21,
- PM_VGPR512 = 22,
- PM_VGPR1024 = 23,
- PM_SGPR96 = 24,
- PM_VGPR96 = 25,
- PM_AGPR96 = 26,
- PM_AGPR32 = 32,
- PM_AGPR64 = 33,
- PM_AGPR128 = 34,
- PM_AGPR512 = 36,
- PM_AGPR1024 = 37
+ PM_SGPR1 = 1,
+ PM_SGPR16 = 5,
+ PM_SGPR32 = 6,
+ PM_SGPR64 = 7,
+ PM_SGPR128 = 8,
+ PM_SGPR256 = 9,
+ PM_SGPR512 = 10,
+ PM_SGPR1024 = 11,
+ PM_VGPR1 = 12,
+ PM_VGPR16 = 16,
+ PM_VGPR32 = 17,
+ PM_VGPR64 = 18,
+ PM_VGPR128 = 19,
+ PM_VGPR256 = 20,
+ PM_VGPR512 = 21,
+ PM_VGPR1024 = 22,
+ PM_SGPR96 = 23,
+ PM_VGPR96 = 24,
+ PM_AGPR96 = 25,
+ PM_AGPR32 = 31,
+ PM_AGPR64 = 32,
+ PM_AGPR128 = 33,
+ PM_AGPR512 = 34,
+ PM_AGPR1024 = 35
};
const RegisterBankInfo::PartialMapping PartMappings[] {
// StartIdx, Length, RegBank
- {0, 1, SCCRegBank},
{0, 1, VCCRegBank},
{0, 1, SGPRRegBank}, // SGPR begin
@@ -75,40 +74,37 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
};
const RegisterBankInfo::ValueMapping ValMappings[] {
- // SCC
- {&PartMappings[0], 1},
-
// VCC
- {&PartMappings[1], 1},
+ {&PartMappings[0], 1},
// SGPRs
- {&PartMappings[2], 1}, // 1
+ {&PartMappings[1], 1}, // 1
{nullptr, 0}, // Illegal power of 2 sizes
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[3], 1}, // 16
- {&PartMappings[4], 1}, // 32
- {&PartMappings[5], 1}, // 64
- {&PartMappings[6], 1}, // 128
- {&PartMappings[7], 1}, // 256
- {&PartMappings[8], 1}, // 512
- {&PartMappings[9], 1}, // 1024
+ {&PartMappings[2], 1}, // 16
+ {&PartMappings[3], 1}, // 32
+ {&PartMappings[4], 1}, // 64
+ {&PartMappings[5], 1}, // 128
+ {&PartMappings[6], 1}, // 256
+ {&PartMappings[7], 1}, // 512
+ {&PartMappings[8], 1}, // 1024
// VGPRs
- {&PartMappings[10], 1}, // 1
+ {&PartMappings[9], 1}, // 1
{nullptr, 0},
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[11], 1}, // 16
- {&PartMappings[12], 1}, // 32
- {&PartMappings[13], 1}, // 64
- {&PartMappings[14], 1}, // 128
- {&PartMappings[15], 1}, // 256
- {&PartMappings[16], 1}, // 512
- {&PartMappings[17], 1}, // 1024
+ {&PartMappings[10], 1}, // 16
+ {&PartMappings[11], 1}, // 32
+ {&PartMappings[12], 1}, // 64
+ {&PartMappings[13], 1}, // 128
+ {&PartMappings[14], 1}, // 256
+ {&PartMappings[15], 1}, // 512
+ {&PartMappings[16], 1}, // 1024
+ {&PartMappings[17], 1},
{&PartMappings[18], 1},
{&PartMappings[19], 1},
- {&PartMappings[20], 1},
// AGPRs
{nullptr, 0},
@@ -116,12 +112,12 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{nullptr, 0},
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[21], 1}, // 32
- {&PartMappings[22], 1}, // 64
- {&PartMappings[23], 1}, // 128
+ {&PartMappings[20], 1}, // 32
+ {&PartMappings[21], 1}, // 64
+ {&PartMappings[22], 1}, // 128
{nullptr, 0},
- {&PartMappings[24], 1}, // 512
- {&PartMappings[25], 1} // 1024
+ {&PartMappings[23], 1}, // 512
+ {&PartMappings[24], 1} // 1024
};
const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
@@ -147,10 +143,9 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
};
enum ValueMappingIdx {
- SCCStartIdx = 0,
- SGPRStartIdx = 2,
- VGPRStartIdx = 13,
- AGPRStartIdx = 27
+ SGPRStartIdx = 1,
+ VGPRStartIdx = 12,
+ AGPRStartIdx = 26
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
@@ -158,12 +153,9 @@ const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
unsigned Idx;
switch (Size) {
case 1:
- if (BankID == AMDGPU::SCCRegBankID)
- return &ValMappings[0];
if (BankID == AMDGPU::VCCRegBankID)
- return &ValMappings[1];
+ return &ValMappings[0];
- // 1-bit values not from a compare etc.
Idx = BankID == AMDGPU::SGPRRegBankID ? PM_SGPR1 : PM_VGPR1;
break;
case 96:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index aadec1a005b..b595facd9f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -70,28 +70,6 @@ void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
InstructionSelector::setupMF(MF, KB, CoverageInfo);
}
-static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
- if (Register::isPhysicalRegister(Reg))
- return Reg == AMDGPU::SCC;
-
- auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
- const TargetRegisterClass *RC =
- RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
- if (RC) {
- // FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
- // context of the register bank has been lost.
- // Has a hack getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which
- // won't ever beconstrained any further.
- if (RC != &AMDGPU::SGPR_32RegClass)
- return false;
- const LLT Ty = MRI.getType(Reg);
- return Ty.isValid() && Ty.getSizeInBits() == 1;
- }
-
- const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
- return RB->getID() == AMDGPU::SCCRegBankID;
-}
-
bool AMDGPUInstructionSelector::isVCC(Register Reg,
const MachineRegisterInfo &MRI) const {
if (Register::isPhysicalRegister(Reg))
@@ -134,12 +112,26 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
return false;
+ const TargetRegisterClass *SrcRC
+ = TRI.getConstrainedRegClassForOperand(Src, *MRI);
+
+ Register MaskedReg = MRI->createVirtualRegister(SrcRC);
+
+ // We can't trust the high bits at this point, so clear them.
+
+ // TODO: Skip masking high bits if def is known boolean.
+
+ unsigned AndOpc = TRI.isSGPRClass(SrcRC) ?
+ AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+ BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+ .addImm(1)
+ .addReg(SrcReg);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
.addImm(0)
- .addReg(SrcReg);
+ .addReg(MaskedReg);
if (!MRI->getRegClassOrNull(SrcReg))
- MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI));
+ MRI->setRegClass(SrcReg, SrcRC);
I.eraseFromParent();
return true;
}
@@ -196,11 +188,6 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
- if (RB.getID() == AMDGPU::SCCRegBankID) {
- LLVM_DEBUG(dbgs() << "illegal scc phi\n");
- return false;
- }
-
DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
@@ -208,6 +195,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
}
+ // TODO: Verify that all registers have the same bank
I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
@@ -407,7 +395,7 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const {
Register Dst1Reg = I.getOperand(1).getReg();
const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO;
- if (!isSCC(Dst1Reg, *MRI)) {
+ if (isVCC(Dst1Reg, *MRI)) {
// The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
// carry out despite the _i32 name. These were renamed in VI to _U32.
// FIXME: We should probably rename the opcodes here.
@@ -742,7 +730,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
Register CCReg = I.getOperand(0).getReg();
- if (isSCC(CCReg, *MRI)) {
+ if (!isVCC(CCReg, *MRI)) {
int Opcode = getS_CMPOpcode(Pred, Size);
if (Opcode == -1)
return false;
@@ -1085,7 +1073,7 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
assert(Size <= 32 || Size == 64);
const MachineOperand &CCOp = I.getOperand(1);
Register CCReg = CCOp.getReg();
- if (isSCC(CCReg, *MRI)) {
+ if (!isVCC(CCReg, *MRI)) {
unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
AMDGPU::S_CSELECT_B32;
MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
@@ -1157,10 +1145,19 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
if (!DstTy.isScalar())
return false;
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const LLT S1 = LLT::scalar(1);
+
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
- if (SrcRB != DstRB)
- return false;
+ const RegisterBank *DstRB;
+ if (DstTy == S1) {
+ // This is a special case. We don't treat s1 for legalization artifacts as
+ // vcc booleans.
+ DstRB = SrcRB;
+ } else {
+ DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ if (SrcRB != DstRB)
+ return false;
+ }
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
@@ -1201,6 +1198,20 @@ static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
return SignedMask >= -16 && SignedMask <= 64;
}
+// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
+const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
+ Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
+ return RB;
+
+ // Ignore the type, since we don't use vcc in artifacts.
+ if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
+ return &RBI.getRegBankFromRegClass(*RC, LLT());
+ return nullptr;
+}
+
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
const DebugLoc &DL = I.getDebugLoc();
@@ -1210,57 +1221,17 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- const LLT S1 = LLT::scalar(1);
const unsigned SrcSize = SrcTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
if (!DstTy.isScalar())
return false;
- const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
-
- if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
- if (SrcTy != S1 || DstSize > 64) // Invalid
- return false;
-
- unsigned Opcode =
- DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
- const TargetRegisterClass *DstRC =
- DstSize > 32 ? &AMDGPU::SReg_64RegClass : &AMDGPU::SReg_32RegClass;
-
- // FIXME: Create an extra copy to avoid incorrectly constraining the result
- // of the scc producer.
- Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
- BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
- .addReg(SrcReg);
- BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
- .addReg(TmpReg);
-
- // The instruction operands are backwards from what you would expect.
- BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
- .addImm(0)
- .addImm(Signed ? -1 : 1);
- I.eraseFromParent();
- return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
- }
-
- if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
- if (SrcTy != S1) // Invalid
- return false;
-
- MachineInstr *ExtI =
- BuildMI(MBB, I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
- .addImm(0) // src0_modifiers
- .addImm(0) // src0
- .addImm(0) // src1_modifiers
- .addImm(Signed ? -1 : 1) // src1
- .addUse(SrcReg);
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
- }
-
if (I.getOpcode() == AMDGPU::G_ANYEXT)
return selectCOPY(I);
+ // Artifact casts should never use vcc.
+ const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
+
if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
// 64-bit should have been split up in RegBankSelect
@@ -1512,12 +1483,15 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
// GlobalISel, we should push that decision into RegBankSelect. Assume for now
// RegBankSelect knows what it's doing if the branch condition is scc, even
// though it currently does not.
- if (isSCC(CondReg, *MRI)) {
+ if (!isVCC(CondReg, *MRI)) {
+ if (MRI->getType(CondReg) != LLT::scalar(32))
+ return false;
+
CondPhysReg = AMDGPU::SCC;
BrOpcode = AMDGPU::S_CBRANCH_SCC1;
// FIXME: Hack for isSCC tests
ConstrainRC = &AMDGPU::SGPR_32RegClass;
- } else if (isVCC(CondReg, *MRI)) {
+ } else {
// FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
// We sort of know that a VCC producer based on the register bank, that ands
// inactive lanes with 0. What if there was a logical operation with vcc
@@ -1526,8 +1500,7 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
CondPhysReg = TRI.getVCC();
BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
ConstrainRC = TRI.getBoolRC();
- } else
- return false;
+ }
if (!MRI->getRegClassOrNull(CondReg))
MRI->setRegClass(CondReg, ConstrainRC);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index ae6b895d8e4..5a48ad80743 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -38,6 +38,7 @@ class MachineInstr;
class MachineIRBuilder;
class MachineOperand;
class MachineRegisterInfo;
+class RegisterBank;
class SIInstrInfo;
class SIMachineFunctionInfo;
class SIRegisterInfo;
@@ -69,6 +70,10 @@ private:
bool isInstrUniform(const MachineInstr &MI) const;
bool isVCC(Register Reg, const MachineRegisterInfo &MRI) const;
+ const RegisterBank *getArtifactRegBank(
+ Register Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
/// tblgen-erated 'select' implementation.
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 64f058a1fba..dc9d3744b4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -244,7 +244,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
S32, S64, S16, V2S16
};
- setAction({G_BRCOND, S1}, Legal);
+ setAction({G_BRCOND, S1}, Legal); // VCC branches
+ setAction({G_BRCOND, S32}, Legal); // SCC branches
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
@@ -296,7 +297,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_UADDO, G_USUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
- .legalFor({{S32, S1}})
+ .legalFor({{S32, S1}, {S32, S32}})
.clampScalar(0, S32, S32)
.scalarize(0); // TODO: Implement.
@@ -505,9 +506,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
auto &CmpBuilder =
getActionDefinitionsBuilder(G_ICMP)
+ // The compare output type differs based on the register bank of the output,
+ // so make both s1 and s32 legal.
+ //
+ // Scalar compares producing output in scc will be promoted to s32, as that
+ // is the allocatable register type that will be needed for the copy from
+ // scc. This will be promoted during RegBankSelect, and we assume something
+ // before that won't try to use s32 result types.
+ //
+ // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
+ // bank.
.legalForCartesianProduct(
{S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
- .legalFor({{S1, S32}, {S1, S64}});
+ .legalForCartesianProduct(
+ {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
if (ST.has16BitInsts()) {
CmpBuilder.legalFor({{S1, S16}});
}
@@ -516,7 +528,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(1)
.clampScalar(1, S32, S64)
.scalarize(0)
- .legalIf(all(typeIs(0, S1), isPointer(1)));
+ .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
getActionDefinitionsBuilder(G_FCMP)
.legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
@@ -888,10 +900,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
// TODO: Pointer types, any 32-bit or 64-bit vector
+
+ // Condition should be s32 for scalar, s1 for vector.
getActionDefinitionsBuilder(G_SELECT)
.legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
- LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1})
+ LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
.clampScalar(0, S16, S64)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5c8afcee2e5..3cd2ddec5b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -40,13 +41,15 @@ namespace {
// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
+ const AMDGPURegisterBankInfo &RBI;
MachineRegisterInfo &MRI;
const RegisterBank *NewBank;
SmallVector<MachineInstr *, 4> NewInsts;
public:
- ApplyRegBankMapping(MachineRegisterInfo &MRI_, const RegisterBank *RB)
- : MRI(MRI_), NewBank(RB) {}
+ ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
+ MachineRegisterInfo &MRI_, const RegisterBank *RB)
+ : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
@@ -55,6 +58,46 @@ public:
/// Set any registers that don't have a set register class or bank to SALU.
void applyBank(MachineInstr &MI) {
+ const unsigned Opc = MI.getOpcode();
+ if (Opc == AMDGPU::G_ANYEXT || Opc == AMDGPU::G_ZEXT ||
+ Opc == AMDGPU::G_SEXT) {
+ // LegalizerHelper wants to use the basic legalization artifacts when
+ // widening etc. We don't handle selection with vcc in artifact sources,
+ // so we need to use a select instead to handle these properly.
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, *RBI.TRI);
+ if (SrcBank == &AMDGPU::VCCRegBank) {
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+ assert(MRI.getType(SrcReg) == S1);
+ assert(MRI.getType(DstReg) == S32);
+ assert(NewBank == &AMDGPU::VGPRRegBank);
+
+ // Replace the extension with a select, which really uses the boolean
+ // source.
+ MachineIRBuilder B(MI);
+ auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
+ auto False = B.buildConstant(S32, 0);
+ B.buildSelect(DstReg, SrcReg, True, False);
+ MRI.setRegBank(True.getReg(0), *NewBank);
+ MRI.setRegBank(False.getReg(0), *NewBank);
+ MI.eraseFromParent();
+ }
+
+ assert(!MRI.getRegClassOrRegBank(DstReg));
+ MRI.setRegBank(DstReg, *NewBank);
+ return;
+ }
+
+#ifndef NDEBUG
+ if (Opc == AMDGPU::G_TRUNC) {
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, *RBI.TRI);
+ assert(DstBank != &AMDGPU::VCCRegBank);
+ }
+#endif
+
for (MachineOperand &Op : MI.operands()) {
if (!Op.isReg())
continue;
@@ -64,10 +107,14 @@ public:
continue;
const RegisterBank *RB = NewBank;
- // FIXME: This might not be enough to detect when SCC should be used.
- if (MRI.getType(Reg) == LLT::scalar(1))
- RB = (NewBank == &AMDGPU::SGPRRegBank ?
- &AMDGPU::SCCRegBank : &AMDGPU::VCCRegBank);
+ if (MRI.getType(Reg) == LLT::scalar(1)) {
+ assert(NewBank == &AMDGPU::VGPRRegBank &&
+ "s1 operands should only be used for vector bools");
+ assert((MI.getOpcode() != AMDGPU::G_TRUNC &&
+ MI.getOpcode() != AMDGPU::G_ANYEXT) &&
+ "not expecting legalization artifacts here");
+ RB = &AMDGPU::VCCRegBank;
+ }
MRI.setRegBank(Reg, *RB);
}
@@ -133,15 +180,13 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
// have been a truncate from an arbitrary value, in which case a copy (lowered
// as a compare with 0) needs to be inserted.
if (Size == 1 &&
- (Dst.getID() == AMDGPU::SCCRegBankID ||
- Dst.getID() == AMDGPU::SGPRRegBankID) &&
+ (Dst.getID() == AMDGPU::SGPRRegBankID) &&
(isVectorRegisterBank(Src) ||
Src.getID() == AMDGPU::SGPRRegBankID ||
Src.getID() == AMDGPU::VCCRegBankID))
return std::numeric_limits<unsigned>::max();
- if (Dst.getID() == AMDGPU::SCCRegBankID &&
- Src.getID() == AMDGPU::VCCRegBankID)
+ if (Src.getID() == AMDGPU::VCCRegBankID)
return std::numeric_limits<unsigned>::max();
// There is no direct copy between AGPRs.
@@ -183,12 +228,19 @@ AMDGPURegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
if (&RC == &AMDGPU::SReg_1RegClass)
return AMDGPU::VCCRegBank;
- if (TRI->isSGPRClass(&RC))
- return AMDGPU::SGPRRegBank;
- if (TRI->isAGPRClass(&RC))
- return AMDGPU::AGPRRegBank;
+ // We promote real scalar booleans to SReg_32. Any SGPR using s1 is really a
+ // VCC-like use.
+ if (TRI->isSGPRClass(&RC)) {
+ // FIXME: This probably came from a copy from a physical register, which
+ // should be inferable from the copied to-type. We don't have many boolean
+ // physical register constraints so just assume a normal SGPR for now.
+ if (!Ty.isValid())
+ return AMDGPU::SGPRRegBank;
+
+ return Ty == LLT::scalar(1) ? AMDGPU::VCCRegBank : AMDGPU::SGPRRegBank;
+ }
- return AMDGPU::VGPRRegBank;
+ return TRI->isAGPRClass(&RC) ? AMDGPU::AGPRRegBank : AMDGPU::VGPRRegBank;
}
template <unsigned NumOps>
@@ -383,11 +435,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
case TargetOpcode::G_CONSTANT: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
if (Size == 1) {
- static const OpRegBankEntry<1> Table[4] = {
+ static const OpRegBankEntry<1> Table[3] = {
{ { AMDGPU::VGPRRegBankID }, 1 },
{ { AMDGPU::SGPRRegBankID }, 1 },
- { { AMDGPU::VCCRegBankID }, 1 },
- { { AMDGPU::SCCRegBankID }, 1 }
+ { { AMDGPU::VCCRegBankID }, 1 }
};
return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
@@ -414,25 +465,17 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
// s_{and|or|xor}_b32 set scc when the result of the 32-bit op is not 0.
const InstructionMapping &SCCMapping = getInstructionMapping(
1, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32)}),
3); // Num Operands
AltMappings.push_back(&SCCMapping);
- const InstructionMapping &SGPRMapping = getInstructionMapping(
- 1, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
- 3); // Num Operands
- AltMappings.push_back(&SGPRMapping);
-
const InstructionMapping &VCCMapping0 = getInstructionMapping(
- 2, 10, getOperandsMapping(
+ 2, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VCCMapping0);
return AltMappings;
@@ -513,9 +556,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
}
case TargetOpcode::G_ICMP: {
+ // TODO: Should report 32-bit for scalar output type.
unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
nullptr, // Predicate operand.
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
@@ -552,7 +596,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
4); // Num Operands
@@ -593,10 +637,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1)}),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1)}),
5); // Num Operands
AltMappings.push_back(&SSMapping);
@@ -613,9 +657,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
case AMDGPU::G_BRCOND: {
assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
+ // TODO: Change type to 32 for scalar
const InstructionMapping &SMapping = getInstructionMapping(
1, 1, getOperandsMapping(
- {AMDGPU::getValueMapping(AMDGPU::SCCRegBankID, 1), nullptr}),
+ {AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1), nullptr}),
2); // Num Operands
AltMappings.push_back(&SMapping);
@@ -1112,7 +1157,7 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
unsigned SplitElts =
MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
- ApplyRegBankMapping O(MRI, &AMDGPU::VGPRRegBank);
+ ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
GISelObserverWrapper Observer(&O);
B.setChangeObserver(Observer);
LegalizerHelper Helper(B.getMF(), Observer, B);
@@ -1171,6 +1216,39 @@ bool AMDGPURegisterBankInfo::applyMappingImage(
return true;
}
+// FIXME: Duplicated from LegalizerHelper
+static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_SMIN:
+ return CmpInst::ICMP_SLT;
+ case TargetOpcode::G_SMAX:
+ return CmpInst::ICMP_SGT;
+ case TargetOpcode::G_UMIN:
+ return CmpInst::ICMP_ULT;
+ case TargetOpcode::G_UMAX:
+ return CmpInst::ICMP_UGT;
+ default:
+ llvm_unreachable("not in integer min/max");
+ }
+}
+
+// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
+void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
+ MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+
+ const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
+ LLT CmpType = LLT::scalar(32);
+
+ auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
+ B.buildSelect(Dst, Cmp, Src0, Src1);
+
+ B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
+ MI.eraseFromParent();
+}
+
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
@@ -1366,16 +1444,122 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
switch (Opc) {
+ case AMDGPU::G_PHI: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy != LLT::scalar(1))
+ break;
+
+ const LLT S32 = LLT::scalar(32);
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::VCCRegBank) {
+ applyDefaultMapping(OpdMapper);
+ // The standard handling only considers the result register bank for
+ // phis. For VCC, blindly inserting a copy when the phi is lowered will
+ // produce an invalid copy. We can only copy with some kind of compare to
+ // get a vector boolean result. Insert a register bank copy that will be
+ // correctly lowered to a compare.
+ MachineIRBuilder B(*MI.getParent()->getParent());
+
+ for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
+ Register SrcReg = MI.getOperand(I).getReg();
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+
+ if (SrcBank != &AMDGPU::VCCRegBank) {
+ MachineBasicBlock *SrcMBB = MI.getOperand(I + 1).getMBB();
+ B.setInsertPt(*SrcMBB, SrcMBB->getFirstTerminator());
+
+ auto Copy = B.buildCopy(LLT::scalar(1), SrcReg);
+ MRI.setRegBank(Copy.getReg(0), AMDGPU::VCCRegBank);
+ MI.getOperand(I).setReg(Copy.getReg(0));
+ }
+ }
+
+ return;
+ }
+
+ // Phi handling is strange and only considers the bank of the destination.
+ substituteSimpleCopyRegs(OpdMapper, 0);
+
+ // Promote SGPR/VGPR booleans to s32
+ MachineFunction *MF = MI.getParent()->getParent();
+ ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
+ GISelObserverWrapper Observer(&ApplyBank);
+ MachineIRBuilder B(MI);
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+
+ return;
+ }
+ case AMDGPU::G_ICMP:
+ case AMDGPU::G_UADDO:
+ case AMDGPU::G_USUBO:
+ case AMDGPU::G_UADDE:
+ case AMDGPU::G_SADDE:
+ case AMDGPU::G_USUBE:
+ case AMDGPU::G_SSUBE: {
+ unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
+ Register DstReg = MI.getOperand(BoolDstOp).getReg();
+
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank != &AMDGPU::SGPRRegBank)
+ break;
+
+ const bool HasCarryIn = MI.getNumOperands() == 5;
+
+ // If this is a scalar compare, promote the result to s32, as the selection
+ // will end up using a copy to a 32-bit vreg.
+ const LLT S32 = LLT::scalar(32);
+ Register NewDstReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
+ MI.getOperand(BoolDstOp).setReg(NewDstReg);
+ MachineIRBuilder B(MI);
+
+ if (HasCarryIn) {
+ Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(NewSrcReg, AMDGPU::SGPRRegBank);
+ B.buildZExt(NewSrcReg, MI.getOperand(4).getReg());
+ MI.getOperand(4).setReg(NewSrcReg);
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ B.setInsertPt(*MBB, std::next(MI.getIterator()));
+ B.buildTrunc(DstReg, NewDstReg);
+ return;
+ }
case AMDGPU::G_SELECT: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
+
+ SmallVector<Register, 1> CondRegs(OpdMapper.getVRegs(1));
+ if (CondRegs.empty())
+ CondRegs.push_back(MI.getOperand(1).getReg());
+ else {
+ assert(CondRegs.size() == 1);
+ }
+
+ const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
+ if (CondBank == &AMDGPU::SGPRRegBank) {
+ MachineIRBuilder B(MI);
+ const LLT S32 = LLT::scalar(32);
+ Register NewCondReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
+
+ MI.getOperand(1).setReg(NewCondReg);
+ B.buildZExt(NewCondReg, CondRegs[0]);
+ }
+
if (DstTy.getSizeInBits() != 64)
break;
+ MachineIRBuilder B(MI);
LLT HalfTy = getHalfSizedType(DstTy);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
- SmallVector<Register, 1> Src0Regs(OpdMapper.getVRegs(1));
SmallVector<Register, 2> Src1Regs(OpdMapper.getVRegs(2));
SmallVector<Register, 2> Src2Regs(OpdMapper.getVRegs(3));
@@ -1385,13 +1569,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
}
- MachineIRBuilder B(MI);
- if (Src0Regs.empty())
- Src0Regs.push_back(MI.getOperand(1).getReg());
- else {
- assert(Src0Regs.size() == 1);
- }
-
if (Src1Regs.empty())
split64BitValueForMapping(B, Src1Regs, HalfTy, MI.getOperand(2).getReg());
else {
@@ -1405,13 +1582,32 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
setRegsToType(MRI, DefRegs, HalfTy);
- B.buildSelect(DefRegs[0], Src0Regs[0], Src1Regs[0], Src2Regs[0]);
- B.buildSelect(DefRegs[1], Src0Regs[0], Src1Regs[1], Src2Regs[1]);
+ B.buildSelect(DefRegs[0], CondRegs[0], Src1Regs[0], Src2Regs[0]);
+ B.buildSelect(DefRegs[1], CondRegs[0], Src1Regs[1], Src2Regs[1]);
MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
MI.eraseFromParent();
return;
}
+ case AMDGPU::G_BRCOND: {
+ Register CondReg = MI.getOperand(0).getReg();
+ // FIXME: Should use legalizer helper, but should change bool ext type.
+ const RegisterBank *CondBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+
+ if (CondBank == &AMDGPU::SGPRRegBank) {
+ MachineIRBuilder B(MI);
+ const LLT S32 = LLT::scalar(32);
+ Register NewCondReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
+
+ MI.getOperand(0).setReg(NewCondReg);
+ B.buildZExt(NewCondReg, CondReg);
+ return;
+ }
+
+ break;
+ }
case AMDGPU::G_AND:
case AMDGPU::G_OR:
case AMDGPU::G_XOR: {
@@ -1419,6 +1615,25 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// there is a VGPR input.
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
+
+ if (DstTy.getSizeInBits() == 1) {
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::VCCRegBank)
+ break;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
+ GISelObserverWrapper Observer(&ApplyBank);
+ MachineIRBuilder B(MI);
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
+ LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+ return;
+ }
+
if (DstTy.getSizeInBits() != 64)
break;
@@ -1484,7 +1699,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
- ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
+ ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
@@ -1505,9 +1720,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineFunction *MF = MI.getParent()->getParent();
MachineIRBuilder B(MI);
- ApplyRegBankMapping ApplySALU(MRI, &AMDGPU::SGPRRegBank);
- GISelObserverWrapper Observer(&ApplySALU);
- LegalizerHelper Helper(*MF, Observer, B);
// Turn scalar min/max into a compare and select.
LLT Ty = MRI.getType(DstReg);
@@ -1515,17 +1727,18 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
LLT S16 = LLT::scalar(16);
if (Ty == S16) {
+ ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ LegalizerHelper Helper(*MF, Observer, B);
+
// Need to widen to s32, and expand as cmp + select.
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widenScalar should have succeeded");
// FIXME: This is relying on widenScalar leaving MI in place.
- if (Helper.lower(MI, 0, S32) != LegalizerHelper::Legalized)
- llvm_unreachable("lower should have succeeded");
- } else {
- if (Helper.lower(MI, 0, Ty) != LegalizerHelper::Legalized)
- llvm_unreachable("lower should have succeeded");
- }
+ lowerScalarMinMax(B, MI);
+ } else
+ lowerScalarMinMax(B, MI);
return;
}
@@ -1543,7 +1756,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
LLT DstTy = MRI.getType(DstReg);
if (DstTy.isScalar() &&
SrcBank != &AMDGPU::SGPRRegBank &&
- SrcBank != &AMDGPU::SCCRegBank &&
SrcBank != &AMDGPU::VCCRegBank &&
// FIXME: Should handle any type that round to s64 when irregular
// breakdowns supported.
@@ -1574,16 +1786,15 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (SrcTy != LLT::scalar(1))
return;
- if (SrcBank == &AMDGPU::SCCRegBank || SrcBank == &AMDGPU::VCCRegBank) {
+ if (SrcBank == &AMDGPU::VCCRegBank) {
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
- const RegisterBank *DstBank = SrcBank == &AMDGPU::SCCRegBank ?
- &AMDGPU::SGPRRegBank : &AMDGPU::VGPRRegBank;
+ const RegisterBank *DstBank = &AMDGPU::VGPRRegBank;
unsigned DstSize = DstTy.getSizeInBits();
// 64-bit select is SGPR only
const bool UseSel64 = DstSize > 32 &&
- SrcBank->getID() == AMDGPU::SCCRegBankID;
+ SrcBank->getID() == AMDGPU::SGPRRegBankID;
// TODO: Should s16 select be legal?
LLT SelType = UseSel64 ? LLT::scalar(64) : LLT::scalar(32);
@@ -1594,7 +1805,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MRI.setRegBank(False.getReg(0), *DstBank);
MRI.setRegBank(DstReg, *DstBank);
- if (DstSize > 32 && SrcBank->getID() != AMDGPU::SCCRegBankID) {
+ if (DstSize > 32) {
B.buildSelect(DefRegs[0], SrcReg, True, False);
B.buildCopy(DefRegs[1], DefRegs[0]);
} else if (DstSize < 32) {
@@ -1955,11 +2166,8 @@ bool AMDGPURegisterBankInfo::isSALUMapping(const MachineInstr &MI) const {
continue;
Register Reg = MI.getOperand(i).getReg();
if (const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI)) {
- if (isVectorRegisterBank(*Bank))
+ if (Bank->getID() != AMDGPU::SGPRRegBankID)
return false;
-
- assert(Bank->getID() == AMDGPU::SGPRRegBankID ||
- Bank->getID() == AMDGPU::SCCRegBankID);
}
}
return true;
@@ -1973,8 +2181,7 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
- unsigned BankID = Size == 1 ? AMDGPU::SCCRegBankID : AMDGPU::SGPRRegBankID;
- OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
+ OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
MI.getNumOperands());
@@ -2228,10 +2435,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
+ // FIXME: Need to promote SGPR case to s32
unsigned OpBank = Bank->getID();
- if (OpBank == AMDGPU::SCCRegBankID)
- OpBank = AMDGPU::SGPRRegBankID;
-
ResultBank = regBankBoolUnion(ResultBank, OpBank);
}
@@ -2273,10 +2478,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
TargetBankID = AMDGPU::VCCRegBankID;
BankLHS = AMDGPU::VCCRegBankID;
BankRHS = AMDGPU::VCCRegBankID;
- } else if (DstBank == &AMDGPU::SCCRegBank) {
- TargetBankID = AMDGPU::SCCRegBankID;
- BankLHS = AMDGPU::SGPRRegBankID;
- BankRHS = AMDGPU::SGPRRegBankID;
} else {
BankLHS = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
AMDGPU::SGPRRegBankID);
@@ -2298,13 +2499,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
BankRHS = AMDGPU::VCCRegBankID;
} else if (BankLHS == AMDGPU::SGPRRegBankID && BankRHS == AMDGPU::SGPRRegBankID) {
TargetBankID = AMDGPU::SGPRRegBankID;
- } else if (BankLHS == AMDGPU::SCCRegBankID || BankRHS == AMDGPU::SCCRegBankID) {
- // The operation must be done on a 32-bit register, but it will set
- // scc. The result type could interchangably be SCC or SGPR, since
- // both values will be produced.
- TargetBankID = AMDGPU::SCCRegBankID;
- BankLHS = AMDGPU::SGPRRegBankID;
- BankRHS = AMDGPU::SGPRRegBankID;
}
}
@@ -2480,7 +2674,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Bank = getRegBankID(Src, MRI, *TRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
- OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
+ OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ?
+ AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) :
+ AMDGPU::getValueMapping(Bank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
@@ -2496,7 +2692,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const RegisterBank *SrcBank = getRegBank(Src, MRI, *TRI);
assert(SrcBank);
switch (SrcBank->getID()) {
- case AMDGPU::SCCRegBankID:
case AMDGPU::SGPRRegBankID:
DstBank = AMDGPU::SGPRRegBankID;
break;
@@ -2557,9 +2752,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
Subtarget.hasScalarCompareEq64()));
- unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
+
+ // TODO: Use 32-bit for scalar output size.
+ // SCC results will need to be copied to a 32-bit SGPR virtual register.
+ const unsigned ResultSize = 1;
- OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, 1);
+ OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize);
OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
@@ -3010,19 +3209,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
Op3Bank == AMDGPU::SGPRRegBankID;
unsigned CondBankDefault = SGPRSrcs ?
- AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
unsigned CondBank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
CondBankDefault);
if (CondBank == AMDGPU::SGPRRegBankID)
- CondBank = SGPRSrcs ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
+ CondBank = SGPRSrcs ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
else if (CondBank == AMDGPU::VGPRRegBankID)
CondBank = AMDGPU::VCCRegBankID;
- unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SCCRegBankID ?
+ unsigned Bank = SGPRSrcs && CondBank == AMDGPU::SGPRRegBankID ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
- assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SCCRegBankID);
+ assert(CondBank == AMDGPU::VCCRegBankID || CondBank == AMDGPU::SGPRRegBankID);
+ // TODO: Should report 32-bit for scalar condition type.
if (Size == 64) {
OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(Bank, Size);
OpdsMapping[1] = AMDGPU::getValueMapping(CondBank, 1);
@@ -3062,7 +3262,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
AMDGPU::SGPRRegBankID);
assert(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() == 1);
- if (Bank != AMDGPU::SCCRegBankID)
+ if (Bank != AMDGPU::SGPRRegBankID)
Bank = AMDGPU::VCCRegBankID;
OpdsMapping[0] = AMDGPU::getValueMapping(Bank, 1);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 7ef1a3615b4..efd5d496573 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -40,10 +40,12 @@ protected:
#include "AMDGPUGenRegisterBank.inc"
};
class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
+public:
const GCNSubtarget &Subtarget;
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
+private:
bool collectWaterfallOperands(
SmallSet<Register, 4> &SGPROperandRegs,
MachineInstr &MI,
@@ -74,6 +76,8 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI, int RSrcIdx) const;
+ void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
+
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index ab3b176ac21..c495316c5bc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -14,8 +14,6 @@ def VGPRRegBank : RegisterBank<"VGPR",
[VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
>;
-def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>;
-
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 5796c6e6a11..863308c76f0 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1775,14 +1775,6 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
&AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
case AMDGPU::SGPRRegBankID:
return &AMDGPU::SReg_32RegClass;
- case AMDGPU::SCCRegBankID:
- // This needs to return an allocatable class, so don't bother returning
- // the dummy SCC class.
- //
- // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this
- // was not an VCC bank value since we use the larger class SReg_32 for
- // other values. These should all use SReg_32.
- return &AMDGPU::SGPR_32RegClass;
default:
llvm_unreachable("unknown register bank");
}
OpenPOWER on IntegriCloud