-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h                 |   4
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp  |   9
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt           |   1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIDefines.h              |   5
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrFormats.td        |   6
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.h            |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIModeRegister.cpp       | 406
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP1Instructions.td      |   8
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP2Instructions.td      |   7
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3Instructions.td      |  36
-rw-r--r--  llvm/lib/Target/AMDGPU/VOP3PInstructions.td     |   8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mode-register.mir      | 459
12 files changed, 946 insertions, 11 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index b77b1f8ad79..d26397a4271 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -59,6 +59,7 @@ FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
+FunctionPass *createSIModeRegisterPass();
void initializeAMDGPUDAGToDAGISelPass(PassRegistry&);
@@ -195,6 +196,9 @@ extern char &SIMemoryLegalizerID;
void initializeSIDebuggerInsertNopsPass(PassRegistry&);
extern char &SIDebuggerInsertNopsID;
+void initializeSIModeRegisterPass(PassRegistry&);
+extern char &SIModeRegisterID;
+
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e3ab1fb8920..70d365f4ad7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -150,6 +150,13 @@ static cl::opt<bool> EnableAtomicOptimizations(
cl::init(false),
cl::Hidden);
+// Enable Mode register optimization
+static cl::opt<bool> EnableSIModeRegisterPass(
+ "amdgpu-mode-register",
+ cl::desc("Enable mode register pass"),
+ cl::init(true),
+ cl::Hidden);
+
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -189,6 +196,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
+ initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);
@@ -894,6 +902,7 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
addPass(createSIShrinkInstructionsPass());
+  if (EnableSIModeRegisterPass)
+    addPass(createSIModeRegisterPass());
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index bdfaabac122..7d121991482 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -120,6 +120,7 @@ add_llvm_target(AMDGPUCodeGen
SIWholeQuadMode.cpp
GCNILPSched.cpp
GCNDPPCombine.cpp
+ SIModeRegister.cpp
)
add_subdirectory(AsmParser)
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index a6d28d6999e..7f6abc34cff 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -88,7 +88,10 @@ enum : uint64_t {
IsPacked = UINT64_C(1) << 49,
// Is a D16 buffer instruction.
- D16Buf = UINT64_C(1) << 50
+ D16Buf = UINT64_C(1) << 50,
+
+ // Uses floating point double precision rounding mode
+ FPDPRounding = UINT64_C(1) << 51
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index b73d30940fc..65ffc27b8b6 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -121,6 +121,10 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is a D16 buffer instruction.
field bit D16Buf = 0;
+ // This bit indicates that this uses the floating point double precision
+ // rounding mode flags
+ field bit FPDPRounding = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -178,6 +182,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{50} = D16Buf;
+ let TSFlags{51} = FPDPRounding;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index ccccd993e6a..5b1a05f3785 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -604,6 +604,14 @@ public:
return MI.getDesc().TSFlags & ClampFlags;
}
+ static bool usesFPDPRounding(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::FPDPRounding;
+ }
+
+ bool usesFPDPRounding(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::FPDPRounding;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
assert(MI.isCopy());
unsigned Dest = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
new file mode 100644
index 00000000000..c4cad95aaf7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -0,0 +1,406 @@
+//===-- SIModeRegister.cpp - Mode Register --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This pass inserts changes to the Mode register settings as required.
+/// Note that currently it only deals with the Double Precision Floating Point
+/// rounding mode setting, but is intended to be generic enough to be easily
+/// expanded.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <queue>
+
+#define DEBUG_TYPE "si-mode-register"
+
+STATISTIC(NumSetregInserted, "Number of setreg instructions inserted.");
+
+using namespace llvm;
+
+struct Status {
+ // Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
+ // known value
+ unsigned Mask;
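+
+  // Mode holds the value of each bit known via Mask; bits outside Mask are
+  // kept at zero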
+ unsigned Mode;
+
+  Status() : Mask(0), Mode(0) {}
+
+  // Mask the incoming Mode in the init list; masking it in the body would
+  // only modify the shadowing parameter, not the member.
+  Status(unsigned Mask, unsigned Mode) : Mask(Mask), Mode(Mode & Mask) {}
+
+  // merge two status values; for any bit known in both, the incoming value
+  // S takes precedence
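+  // (e.g. merging {Mask=0b0011, Mode=0b0001} with {Mask=0b0110, Mode=0b0110}
+  // gives {Mask=0b0111, Mode=0b0111}: bit 1 is known in both and takes S's
+  // value)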
+ Status merge(const Status &S) const {
+ return Status((Mask | S.Mask), ((Mode & ~S.Mask) | (S.Mode & S.Mask)));
+ }
+
+  // merge in an unknown value: bits covered by NewMask become unknown in
+  // the result (they are removed from both the mask and the mode)
+  Status mergeUnknown(unsigned NewMask) {
+    return Status(Mask & ~NewMask, Mode & ~NewMask);
+  }
+
+ // intersect two Status values to produce a mode and mask that is a subset
+ // of both values
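+  // (e.g. intersecting {Mask=0b11, Mode=0b01} with {Mask=0b11, Mode=0b11}
+  // gives {Mask=0b01, Mode=0b01}: only bit 0 is known in both and agrees)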
+ Status intersect(const Status &S) const {
+    unsigned NewMask = (Mask & S.Mask) & ~(Mode ^ S.Mode);
+ unsigned NewMode = (Mode & NewMask);
+ return Status(NewMask, NewMode);
+ }
+
+ // produce the delta required to change the Mode to the required Mode
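+  // (e.g. if this is {Mask=0b1100, Mode=0b0100} and S is
+  // {Mask=0b1010, Mode=0b0010}, the delta is {Mask=0b0010, Mode=0b0010}:
+  // bit 3 already holds the required value and bit 1 is unknown here, so
+  // only bit 1 needs to be set)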
+ Status delta(const Status &S) const {
+ return Status((S.Mask & (Mode ^ S.Mode)) | (~Mask & S.Mask), S.Mode);
+ }
+
+ bool operator==(const Status &S) const {
+ return (Mask == S.Mask) && (Mode == S.Mode);
+ }
+
+ bool operator!=(const Status &S) const { return !(*this == S); }
+
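+  // S is compatible with this Status if every bit S requires is known here
+  // and holds the required value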
+  bool isCompatible(const Status &S) const {
+ return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
+ }
+
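+  // S can be combined with this Status if its bits don't overlap the known
+  // bits at all, or if S is compatible with them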
+  bool isCombinable(const Status &S) const {
+ return !(Mask & S.Mask) || isCompatible(S);
+ }
+};
+
+class BlockData {
+public:
+ // The Status that represents the mode register settings required by the
+ // FirstInsertionPoint (if any) in this block. Calculated in Phase 1.
+ Status Require;
+
+ // The Status that represents the net changes to the Mode register made by
+ // this block, Calculated in Phase 1.
+  // this block. Calculated in Phase 1.
+
+ // The Status that represents the mode register settings on exit from this
+ // block. Calculated in Phase 2.
+ Status Exit;
+
+ // The Status that represents the intersection of exit Mode register settings
+ // from all predecessor blocks. Calculated in Phase 2, and used by Phase 3.
+ Status Pred;
+
+ // In Phase 1 we record the first instruction that has a mode requirement,
+ // which is used in Phase 3 if we need to insert a mode change.
+ MachineInstr *FirstInsertionPoint;
+
+  BlockData() : FirstInsertionPoint(nullptr) {}
+};
+
+namespace {
+
+class SIModeRegister : public MachineFunctionPass {
+public:
+ static char ID;
+
+ std::vector<std::unique_ptr<BlockData>> BlockInfo;
+  std::queue<MachineBasicBlock *> Phase2List;
+
+  // Whether the current function has been modified; the NumSetregInserted
+  // statistic cannot be used for this, as it accumulates across functions.
+  bool Changed = false;
+
+ // The default mode register setting currently only caters for the floating
+ // point double precision rounding mode.
+ // We currently assume the default rounding mode is Round to Nearest
+ // NOTE: this should come from a per function rounding mode setting once such
+ // a setting exists.
+ unsigned DefaultMode = FP_ROUND_ROUND_TO_NEAREST;
+ Status DefaultStatus =
+ Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+
+public:
+ SIModeRegister() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ void processBlockPhase1(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+ void processBlockPhase2(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+ void processBlockPhase3(MachineBasicBlock &MBB, const SIInstrInfo *TII);
+
+ Status getInstructionMode(MachineInstr &MI, const SIInstrInfo *TII);
+
+ void insertSetreg(MachineBasicBlock &MBB, MachineInstr *I,
+ const SIInstrInfo *TII, Status InstrMode);
+};
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIModeRegister, DEBUG_TYPE,
+ "Insert required mode register values", false, false)
+
+char SIModeRegister::ID = 0;
+
+char &llvm::SIModeRegisterID = SIModeRegister::ID;
+
+FunctionPass *llvm::createSIModeRegisterPass() { return new SIModeRegister(); }
+
+// Determine the Mode register setting required for this instruction.
+// Instructions which don't use the Mode register return a null Status.
+// Note this currently only deals with instructions that use the floating point
+// double precision setting.
+Status SIModeRegister::getInstructionMode(MachineInstr &MI,
+ const SIInstrInfo *TII) {
+ if (TII->usesFPDPRounding(MI)) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::V_INTERP_P1LL_F16:
+ case AMDGPU::V_INTERP_P1LV_F16:
+ case AMDGPU::V_INTERP_P2_F16:
+ // f16 interpolation instructions need double precision round to zero
+ return Status(FP_ROUND_MODE_DP(3),
+ FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_ZERO));
+ default:
+ return DefaultStatus;
+ }
+ }
+ return Status();
+}
+
+// Insert a setreg instruction to update the Mode register.
+// It is possible (though unlikely) for an instruction to require a change to
+// the value of disjoint parts of the Mode register when we don't know the
+// value of the intervening bits. In that case we need to use more than one
+// setreg instruction.
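+// (e.g. a required change of {Mask=0b11011, Mode=0b01011} is emitted as two
+// setregs: width 2 at offset 0 writing value 3, and width 2 at offset 3
+// writing value 1)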
+void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
+ const SIInstrInfo *TII, Status InstrMode) {
+ while (InstrMode.Mask) {
+ unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
+ unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
+ unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
+    BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_SETREG_IMM32_B32))
+ .addImm(Value)
+ .addImm(((Width - 1) << AMDGPU::Hwreg::WIDTH_M1_SHIFT_) |
+ (Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
+    ++NumSetregInserted;
+    Changed = true;
+    InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
+ }
+}
+
+// In Phase 1 we iterate through the instructions of the block and for each
+// instruction we get its mode usage. If the instruction uses the Mode register
+// we:
+// - update the Change status, which tracks the changes to the Mode register
+// made by this block
+// - if this instruction's requirements are compatible with the current setting
+// of the Mode register we merge the modes
+// - if it isn't compatible and an InsertionPoint isn't set, then we set the
+// InsertionPoint to the current instruction, and we remember the current
+// mode
+// - if it isn't compatible and InsertionPoint is set we insert a setreg before
+// that instruction (unless this instruction forms part of the block's
+// entry requirements in which case the insertion is deferred until Phase 3
+// when predecessor exit values are known), and move the insertion point to
+// this instruction
+// - if this is a setreg instruction we treat it as an incompatible instruction.
+// This is sub-optimal but avoids some nasty corner cases, and is expected to
+// occur very rarely.
+// - on exit we have set the Require, Change, and initial Exit modes.
+void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
+ const SIInstrInfo *TII) {
+ auto NewInfo = llvm::make_unique<BlockData>();
+ MachineInstr *InsertionPoint = nullptr;
+ // RequirePending is used to indicate whether we are collecting the initial
+ // requirements for the block, and need to defer the first InsertionPoint to
+ // Phase 3. It is set to false once we have set FirstInsertionPoint, or when
+  // we discover an explicit setreg that means this block doesn't have any
+ // initial requirements.
+ bool RequirePending = true;
+ Status IPChange;
+ for (MachineInstr &MI : MBB) {
+ Status InstrMode = getInstructionMode(MI, TII);
+ if ((MI.getOpcode() == AMDGPU::S_SETREG_B32) ||
+ (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32)) {
+ // We preserve any explicit mode register setreg instruction we encounter,
+ // as we assume it has been inserted by a higher authority (this is
+ // likely to be a very rare occurrence).
+      unsigned Imm = TII->getNamedOperand(MI, AMDGPU::OpName::simm16)->getImm();
+      if (((Imm & AMDGPU::Hwreg::ID_MASK_) >> AMDGPU::Hwreg::ID_SHIFT_) !=
+          AMDGPU::Hwreg::ID_MODE)
+        continue;
+
+      unsigned Width = ((Imm & AMDGPU::Hwreg::WIDTH_M1_MASK_) >>
+                        AMDGPU::Hwreg::WIDTH_M1_SHIFT_) +
+                       1;
+      unsigned Offset =
+          (Imm & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
+      unsigned Mask = ((1 << Width) - 1) << Offset;
+
+ // If an InsertionPoint is set we will insert a setreg there.
+ if (InsertionPoint) {
+ insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+ InsertionPoint = nullptr;
+ }
+ // If this is an immediate then we know the value being set, but if it is
+ // not an immediate then we treat the modified bits of the mode register
+ // as unknown.
+ if (MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32) {
+ unsigned Val = TII->getNamedOperand(MI, AMDGPU::OpName::imm)->getImm();
+ unsigned Mode = (Val << Offset) & Mask;
+ Status Setreg = Status(Mask, Mode);
+ // If we haven't already set the initial requirements for the block we
+ // don't need to as the requirements start from this explicit setreg.
+ RequirePending = false;
+ NewInfo->Change = NewInfo->Change.merge(Setreg);
+ } else {
+ NewInfo->Change = NewInfo->Change.mergeUnknown(Mask);
+ }
+ } else if (!NewInfo->Change.isCompatible(InstrMode)) {
+ // This instruction uses the Mode register and its requirements aren't
+ // compatible with the current mode.
+ if (InsertionPoint) {
+ // If the required mode change cannot be included in the current
+ // InsertionPoint changes, we need a setreg and start a new
+ // InsertionPoint.
+ if (!IPChange.delta(NewInfo->Change).isCombinable(InstrMode)) {
+ if (RequirePending) {
+ // This is the first insertionPoint in the block so we will defer
+ // the insertion of the setreg to Phase 3 where we know whether or
+ // not it is actually needed.
+ NewInfo->FirstInsertionPoint = InsertionPoint;
+ NewInfo->Require = NewInfo->Change;
+ RequirePending = false;
+ } else {
+ insertSetreg(MBB, InsertionPoint, TII,
+ IPChange.delta(NewInfo->Change));
+ IPChange = NewInfo->Change;
+ }
+ // Set the new InsertionPoint
+ InsertionPoint = &MI;
+ }
+ NewInfo->Change = NewInfo->Change.merge(InstrMode);
+ } else {
+ // No InsertionPoint is currently set - this is either the first in
+ // the block or we have previously seen an explicit setreg.
+ InsertionPoint = &MI;
+ IPChange = NewInfo->Change;
+ NewInfo->Change = NewInfo->Change.merge(InstrMode);
+ }
+ }
+ }
+ if (RequirePending) {
+ // If we haven't yet set the initial requirements for the block we set them
+ // now.
+ NewInfo->FirstInsertionPoint = InsertionPoint;
+ NewInfo->Require = NewInfo->Change;
+ } else if (InsertionPoint) {
+ // We need to insert a setreg at the InsertionPoint
+ insertSetreg(MBB, InsertionPoint, TII, IPChange.delta(NewInfo->Change));
+ }
+ NewInfo->Exit = NewInfo->Change;
+ BlockInfo[MBB.getNumber()] = std::move(NewInfo);
+}
+
+// In Phase 2 we revisit each block and calculate the common Mode register
+// value provided by all predecessor blocks. If the Exit value for the block
+// is changed, then we add the successor blocks to the worklist so that the
+// exit value is propagated.
+void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
+ const SIInstrInfo *TII) {
+ unsigned ThisBlock = MBB.getNumber();
+ if (MBB.pred_empty()) {
+ // There are no predecessors, so use the default starting status.
+ BlockInfo[ThisBlock]->Pred = DefaultStatus;
+ } else {
+ // Build a status that is common to all the predecessors by intersecting
+ // all the predecessor exit status values.
+ MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
+ MachineBasicBlock &PB = *(*P);
+ BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+
+ for (P = std::next(P); P != E; P = std::next(P)) {
+ MachineBasicBlock *Pred = *P;
+      BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(
+          BlockInfo[Pred->getNumber()]->Exit);
+ }
+ }
+  Status TmpStatus =
+      BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+ if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
+ BlockInfo[ThisBlock]->Exit = TmpStatus;
+ // Add the successors to the work list so we can propagate the changed exit
+ // status.
+ for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+ E = MBB.succ_end();
+ S != E; S = std::next(S)) {
+ MachineBasicBlock &B = *(*S);
+ Phase2List.push(&B);
+ }
+ }
+}
+
+// In Phase 3 we revisit each block and if it has an insertion point defined we
+// check whether the predecessor mode meets the block's entry requirements. If
+// not we insert an appropriate setreg instruction to modify the Mode register.
+void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
+ const SIInstrInfo *TII) {
+ unsigned ThisBlock = MBB.getNumber();
+ if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
+    Status Delta =
+        BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+ if (BlockInfo[ThisBlock]->FirstInsertionPoint)
+ insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
+ else
+ insertSetreg(MBB, &MBB.instr_front(), TII, Delta);
+ }
+}
+
+bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
+  Changed = false;
+  BlockInfo.resize(MF.getNumBlockIDs());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+  // Processing is performed in three phases:
+
+ // Phase 1 - determine the initial mode required by each block, and add setreg
+ // instructions for intra block requirements.
+ for (MachineBasicBlock &BB : MF)
+ processBlockPhase1(BB, TII);
+
+ // Phase 2 - determine the exit mode from each block. We add all blocks to the
+ // list here, but will also add any that need to be revisited during Phase 2
+ // processing.
+ for (MachineBasicBlock &BB : MF)
+ Phase2List.push(&BB);
+ while (!Phase2List.empty()) {
+ processBlockPhase2(*Phase2List.front(), TII);
+ Phase2List.pop();
+ }
+
+ // Phase 3 - add an initial setreg to each block where the required entry mode
+ // is not satisfied by the exit mode of all its predecessors.
+ for (MachineBasicBlock &BB : MF)
+ processBlockPhase3(BB, TII);
+
+ BlockInfo.clear();
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 9da99d9f63e..68446ab7972 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -179,7 +179,9 @@ defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
+let FPDPRounding = 1 in {
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+} // End FPDPRounding = 1
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
@@ -232,7 +234,9 @@ defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
defm V_FREXP_MANT_F64 : VOP1Inst <"v_frexp_mant_f64", VOP_F64_F64, int_amdgcn_frexp_mant>;
+let FPDPRounding = 1 in {
defm V_FRACT_F64 : VOP1Inst <"v_fract_f64", VOP_F64_F64, AMDGPUfract>;
+} // End FPDPRounding = 1
} // End SchedRW = [WriteDoubleAdd]
defm V_FREXP_EXP_I32_F32 : VOP1Inst <"v_frexp_exp_i32_f32", VOP_I32_F32, int_amdgcn_frexp_exp>;
@@ -339,8 +343,10 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
let SubtargetPredicate = Has16BitInsts in {
+let FPDPRounding = 1 in {
defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
+} // End FPDPRounding = 1
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
let SchedRW = [WriteQuarterRate32] in {
@@ -358,7 +364,9 @@ defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst <"v_trunc_f16", VOP_F16_F16, ftrunc>;
defm V_RNDNE_F16 : VOP1Inst <"v_rndne_f16", VOP_F16_F16, frint>;
+let FPDPRounding = 1 in {
defm V_FRACT_F16 : VOP1Inst <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
+} // End FPDPRounding = 1
}
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 1bea9c367b4..e3fd7b5f9fa 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -555,18 +555,23 @@ def : divergent_i64_BinOp <xor, V_XOR_B32_e32>;
let SubtargetPredicate = Has16BitInsts in {
+let FPDPRounding = 1 in {
def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
+defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+} // End FPDPRounding = 1
+
defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
-defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
let isCommutable = 1 in {
+let FPDPRounding = 1 in {
defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+} // End FPDPRounding = 1
defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 1f88a240ecb..4b8c1f208a0 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -220,7 +220,8 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
// VOP3 INTERP
//===----------------------------------------------------------------------===//
-class VOP3Interp<string OpName, VOPProfile P> : VOP3_Pseudo<OpName, P> {
+class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
+ VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
}
@@ -292,9 +293,11 @@ def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
+let FPDPRounding = 1 in {
def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
+} // End FPDPRounding = 1
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
def V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like, 1>;
} // End SchedRW = [WriteDoubleAdd]
@@ -324,6 +327,7 @@ def V_DIV_FMAS_F32 : VOP3_Pseudo <"v_div_fmas_f32", VOP_F32_F32_F32_F32_VCC,
def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC,
getVOP3VCC<VOP_F64_F64_F64_F64_VCC, AMDGPUdiv_fmas>.ret> {
let SchedRW = [WriteDouble];
+ let FPDPRounding = 1;
}
} // End Uses = [VCC, EXEC]
@@ -354,10 +358,10 @@ def V_SAD_U32 : VOP3Inst <"v_sad_u32", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CL
def V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_I32>, int_amdgcn_cvt_pk_u8_f32>;
def V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUdiv_fixup>;
-let SchedRW = [WriteDoubleAdd] in {
+let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
-} // End SchedRW = [WriteDoubleAdd]
+} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
@@ -368,6 +372,7 @@ def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32,
def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
let SchedRW = [WriteDouble, WriteSALU];
let AsmMatchConverter = "";
+ let FPDPRounding = 1;
}
def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -431,39 +436,51 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
def V_DIV_FIXUP_F16 : VOP3Inst <"v_div_fixup_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUdiv_fixup> {
let Predicates = [Has16BitInsts, isVIOnly];
+ let FPDPRounding = 1;
}
def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup> {
let renamedInGFX9 = 1;
let Predicates = [Has16BitInsts, isGFX9];
+ let FPDPRounding = 1;
}
def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
let Predicates = [Has16BitInsts, isVIOnly];
+ let FPDPRounding = 1;
}
def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
let renamedInGFX9 = 1;
let Predicates = [Has16BitInsts, isGFX9];
+ let FPDPRounding = 1;
}
let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
let renamedInGFX9 = 1 in {
-def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
+let FPDPRounding = 1 in {
+def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
+let Uses = [M0, EXEC] in {
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>>;
-}
+} // End Uses = [M0, EXEC]
+} // End FPDPRounding = 1
+} // End renamedInGFX9 = 1
let SubtargetPredicate = isGFX9 in {
-def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+def V_MAD_F16_gfx9 : VOP3Inst <"v_mad_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>> {
+ let FPDPRounding = 1;
+}
def V_MAD_U16_gfx9 : VOP3Inst <"v_mad_u16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>>;
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9
+let Uses = [M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -845,12 +862,15 @@ defm V_FMA_F16 : VOP3_F16_Real_vi <0x1ee>;
defm V_DIV_FIXUP_F16 : VOP3_F16_Real_vi <0x1ef>;
defm V_INTERP_P2_F16 : VOP3Interp_F16_Real_vi <0x276>;
+let FPDPRounding = 1 in {
defm V_MAD_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ea, "V_MAD_F16", "v_mad_legacy_f16">;
-defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
-defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
defm V_FMA_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ee, "V_FMA_F16", "v_fma_legacy_f16">;
defm V_DIV_FIXUP_LEGACY_F16 : VOP3_F16_Real_gfx9 <0x1ef, "V_DIV_FIXUP_F16", "v_div_fixup_legacy_f16">;
defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16", "v_interp_p2_legacy_f16">;
+} // End FPDPRounding = 1
+
+defm V_MAD_LEGACY_U16 : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16", "v_mad_legacy_u16">;
+defm V_MAD_LEGACY_I16 : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16", "v_mad_legacy_i16">;
defm V_MAD_F16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
defm V_MAD_U16_gfx9 : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 2efd28b9cd8..0d25a86da32 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -42,12 +42,14 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
}
let isCommutable = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+let FPDPRounding = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+} // End FPDPRounding = 1
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
@@ -137,12 +139,14 @@ let SubtargetPredicate = HasMadMixInsts in {
let isCommutable = 1 in {
def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
let ClampLo = 0, ClampHi = 1 in {
def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
}
+} // End FPDPRounding = 1
}
defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
@@ -154,12 +158,14 @@ let SubtargetPredicate = HasFmaMixInsts in {
let isCommutable = 1 in {
def V_FMA_MIX_F32 : VOP3_VOP3PInst<"v_fma_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
+let FPDPRounding = 1 in {
// Clamp modifier is applied after conversion to f16.
def V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
let ClampLo = 0, ClampHi = 1 in {
def V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, 1>;
}
+} // End FPDPRounding = 1
}
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
diff --git a/llvm/test/CodeGen/AMDGPU/mode-register.mir b/llvm/test/CodeGen/AMDGPU/mode-register.mir
new file mode 100644
index 00000000000..bf06a01d4c3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/mode-register.mir
@@ -0,0 +1,459 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-mode-register %s -o - | FileCheck %s
+
+---
+# check that the mode is changed to RTZ from the default RTN for f16 interp
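+# (a setreg simm16 of 2177 selects hwreg(HW_REG_MODE, 2, 2), the
+# double-precision rounding field, which also governs f16 rounding on gfx9;
+# an immediate of 3 is round-to-zero, 0 is round-to-nearest)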
+# CHECK-LABEL: name: interp_f16_default
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_INTERP_P1LL_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_default
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ $m0 = S_MOV_B32 killed $sgpr2
+ $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+ $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that the mode is not changed for interp f16 when the mode is already RTZ
+# CHECK-LABEL: name: interp_f16_explicit_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_MOV_B32_e32
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: interp_f16_explicit_rtz
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_SETREG_IMM32_B32 3, 2177
+ $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+ $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that explicit RTN mode change is registered
+# CHECK-LABEL: name: explicit_rtn
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: V_INTERP_P1LL_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_ADD_F16_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: explicit_rtn
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2
+ $m0 = S_MOV_B32 killed $sgpr2
+ $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit $exec
+ $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+ $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+ S_SETREG_IMM32_B32 0, 2177
+ $vgpr0 = V_ADD_F16_e32 killed $vgpr1, killed $vgpr0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that the mode is unchanged from RTN for F64 instruction
+# CHECK-LABEL: name: rtn_default
+# CHECK-LABEL: bb.0:
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK: V_FRACT_F64
+
+name: rtn_default
+
+body: |
+ bb.0:
+ liveins: $vgpr1_vgpr2
+ $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+ S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for the F64 instruction
+# CHECK-LABEL: name: rtn_from_rtz
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NEXT: S_SETREG_IMM32_B32 0, 2177
+# CHECK-NEXT: V_FRACT_F64
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: rtn_from_rtz
+
+body: |
+ bb.0:
+ liveins: $vgpr1_vgpr2
+ S_SETREG_IMM32_B32 3, 2177
+ $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+ S_ENDPGM
+...
+---
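+# check that the mode is changed from RTN to RTZ for the f16 interp in the
+# successor block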
+# CHECK-LABEL: name: rtz_from_rtn
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: rtz_from_rtn
+
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $vgpr1_vgpr2
+ $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that the mode is changed from RTZ to RTN for the F64 instruction
+# and back again for the remaining interp instruction
+# CHECK-LABEL: name: interp_f16_plus_sqrt_f64
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: interp_f16_plus_sqrt_f64
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ $m0 = S_MOV_B32 killed $sgpr2
+ $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+ $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that an explicit change to the single precision mode has no effect
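+# (the setreg of 2, 2049 targets hwreg(HW_REG_MODE, 0, 2), the
+# single-precision rounding field, which does not overlap the
+# double-precision field this pass needs to manage)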
+# CHECK-LABEL: name: single_precision_mode_change
+# CHECK-LABEL: bb.0:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P1LL_F16
+# CHECK: V_INTERP_P2_F16
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P2_F16
+
+name: single_precision_mode_change
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ $m0 = S_MOV_B32 killed $sgpr2
+ $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ S_SETREG_IMM32_B32 2, 2049
+ $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $exec
+ $vgpr0 = V_INTERP_P1LL_F16 0, killed $vgpr0, 2, 1, -1, 0, 0, implicit $m0, implicit $exec
+ $vgpr1 = V_INTERP_P2_F16 0, $vgpr2, 2, 1, 0, killed $vgpr1, 0, 0, implicit $m0, implicit $exec
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ $vgpr0 = V_INTERP_P2_F16 0, killed $vgpr2, 2, 1, 0, killed $vgpr0, -1, 0, implicit $m0, implicit $exec
+ $vgpr0 = V_ADD_F16_e32 killed $sgpr0, killed $vgpr0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that the mode is propagated back to the start of the loop - the first
+# instruction needs RTN but still requires a setreg because RTZ is set later
+# in the loop
+# CHECK-LABEL: name: loop
+# CHECK-LABEL: bb.1:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ successors: %bb.1
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.1, %bb.3
+ $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ S_BRANCH %bb.3
+
+ bb.3:
+ S_ENDPGM
+...
+---
+# check that two back-edges to the same node with different modes are handled
+# CHECK-LABEL: name: double_loop
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.2:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+
+name: double_loop
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ successors: %bb.1
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ S_NOP 1
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.1, %bb.3
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4
+ S_NOP 1
+ S_BRANCH %bb.4
+
+ bb.4:
+ successors: %bb.5
+ S_NOP 1
+ S_BRANCH %bb.5
+
+ bb.5:
+ successors: %bb.1, %bb.6
+ S_SETREG_IMM32_B32 3, 2177
+ S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ S_BRANCH %bb.6
+
+ bb.6:
+ S_ENDPGM
+...
+---
+# check that the mode is propagated back to the start of the loop and through
+# a block that neither sets nor uses the mode.
+# CHECK-LABEL: name: loop_indirect
+# CHECK-NOT: S_SETREG_IMM32_B32
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 3, 2177
+# CHECK: V_INTERP_P1LL_F16
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: loop_indirect
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ successors: %bb.1
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ S_NOP 1
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ S_NOP 1
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.1, %bb.4
+ $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ S_CBRANCH_VCCZ %bb.1, implicit $vcc
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM
+...
+---
+# check that multiple mode values are propagated to a block that uses the mode
+# CHECK-LABEL: name: multiple_mode_direct
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_direct
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ successors: %bb.1
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.3
+ S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ S_BRANCH %bb.3
+
+ bb.2:
+ successors: %bb.3
+ S_SETREG_IMM32_B32 3, 2177
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM
+...
+---
+# check that multiple mode values are propagated through a block that neither
+# sets nor uses the mode.
+# CHECK-LABEL: name: multiple_mode_indirect
+# CHECK-LABEL: bb.4:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: multiple_mode_indirect
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ successors: %bb.1
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.3
+ S_CBRANCH_VCCZ %bb.2, implicit $vcc
+ S_BRANCH %bb.3
+
+ bb.2:
+ successors: %bb.3
+ S_SETREG_IMM32_B32 3, 2177
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4
+ S_NOP 1
+ S_BRANCH %bb.4
+
+ bb.4:
+ successors: %bb.5
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ S_BRANCH %bb.5
+
+ bb.5:
+ S_ENDPGM
+...
+---
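+# check that the RTZ mode set for the first interp is known to survive the
+# intervening empty blocks, so the interp in bb.4 needs no additional setreg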
+# CHECK-LABEL: name: pass_through_blocks
+# CHECK-LABEL: bb.0:
+# CHECK: V_FRACT_F64_e32
+# CHECK-NEXT: S_SETREG_IMM32_B32 3, 2177
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: pass_through_blocks
+
+body: |
+ bb.0:
+ successors: %bb.1
+ liveins: $vgpr1_vgpr2
+ $vgpr1_vgpr2 = V_FRACT_F64_e32 killed $vgpr1_vgpr2, implicit $exec
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4
+ S_BRANCH %bb.4
+
+ bb.4:
+ $vgpr1 = V_INTERP_P1LL_F16 0, $vgpr0, 2, 1, 0, 0, 0, implicit $m0, implicit $exec
+ S_ENDPGM
+...
+---
+# check that multiple mode values are propagated
+# CHECK-LABEL: name: if_then_else
+# CHECK-LABEL: bb.3:
+# CHECK: S_SETREG_IMM32_B32 0, 2177
+# CHECK: V_FRACT_F64_e32
+# CHECK-NOT: S_SETREG_IMM32_B32
+
+name: if_then_else
+
+body: |
+ bb.0:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $vgpr3, $vgpr4
+ successors: %bb.1
+ $m0 = S_MOV_B32 killed $sgpr2
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2, %bb.3
+ S_CBRANCH_VCCZ %bb.3, implicit $vcc
+ S_BRANCH %bb.2
+
+ bb.2:
+ successors: %bb.3
+ S_SETREG_IMM32_B32 3, 2177
+ S_BRANCH %bb.3
+
+ bb.3:
+ successors: %bb.4
+ $vgpr3_vgpr4 = V_FRACT_F64_e32 killed $vgpr3_vgpr4, implicit $exec
+ S_BRANCH %bb.4
+
+ bb.4:
+ S_ENDPGM
+...