summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTom Stellard <thomas.stellard@amd.com>2016-08-29 19:42:52 +0000
committerTom Stellard <thomas.stellard@amd.com>2016-08-29 19:42:52 +0000
commit0d23ebe8883af4b280897af751614c7b433e00f7 (patch)
tree451e879c6008fb9fa90460d4fe9d22640f3ea01b
parent9b1669ae353e02c0fdbbe1ac181f18a453e98b35 (diff)
downloadbcm5719-llvm-0d23ebe8883af4b280897af751614c7b433e00f7.tar.gz
bcm5719-llvm-0d23ebe8883af4b280897af751614c7b433e00f7.zip
AMDGPU/SI: Implement a custom MachineSchedStrategy
Summary: GCNSchedStrategy re-uses most of GenericScheduler, it's just uses a different method to compute the excess and critical register pressure limits. It's not enabled by default, to enable it you need to pass -misched=gcn to llc. Shader DB stats: 32464 shaders in 17874 tests Totals: SGPRS: 1542846 -> 1643125 (6.50 %) VGPRS: 1005595 -> 904653 (-10.04 %) Spilled SGPRs: 29929 -> 27745 (-7.30 %) Spilled VGPRs: 334 -> 352 (5.39 %) Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread Code Size: 36688188 -> 37034900 (0.95 %) bytes LDS: 1913 -> 1913 (0.00 %) blocks Max Waves: 254101 -> 265125 (4.34 %) Wait states: 0 -> 0 (0.00 %) Totals from affected shaders: SGPRS: 1338220 -> 1438499 (7.49 %) VGPRS: 886221 -> 785279 (-11.39 %) Spilled SGPRs: 29869 -> 27685 (-7.31 %) Spilled VGPRs: 334 -> 352 (5.39 %) Scratch VGPRs: 1612 -> 1624 (0.74 %) dwords per thread Code Size: 34315716 -> 34662428 (1.01 %) bytes LDS: 1551 -> 1551 (0.00 %) blocks Max Waves: 188127 -> 199151 (5.86 %) Wait states: 0 -> 0 (0.00 %) Reviewers: arsenm, mareko, nhaehnle, MatzeB, atrick Subscribers: arsenm, kzhuravl, llvm-commits Differential Revision: https://reviews.llvm.org/D23688 llvm-svn: 279995
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp45
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/CMakeLists.txt1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp312
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h54
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SISchedule.td4
-rw-r--r--llvm/test/CodeGen/AMDGPU/and.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/ctpop64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fceil64.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/fma-combine.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fmax3.f64.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-constant-i16.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-global-i32.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/missing-store.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/rcp-pattern.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/ret.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/sad.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/salu-to-valu.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/select-vectors.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/trunc.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/udivrem.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll18
32 files changed, 513 insertions, 63 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 44f59c33125..03ac8cceea3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -214,3 +214,48 @@ void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
}
+
+unsigned SISubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
+ if (getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+ if (SGPRs <= 80)
+ return 10;
+ if (SGPRs <= 88)
+ return 9;
+ if (SGPRs <= 100)
+ return 8;
+ return 7;
+ }
+ if (SGPRs <= 48)
+ return 10;
+ if (SGPRs <= 56)
+ return 9;
+ if (SGPRs <= 64)
+ return 8;
+ if (SGPRs <= 72)
+ return 7;
+ if (SGPRs <= 80)
+ return 6;
+ return 5;
+}
+
+unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
+ if (VGPRs <= 24)
+ return 10;
+ if (VGPRs <= 28)
+ return 9;
+ if (VGPRs <= 32)
+ return 8;
+ if (VGPRs <= 36)
+ return 7;
+ if (VGPRs <= 40)
+ return 6;
+ if (VGPRs <= 48)
+ return 5;
+ if (VGPRs <= 64)
+ return 4;
+ if (VGPRs <= 84)
+ return 3;
+ if (VGPRs <= 128)
+ return 2;
+ return 1;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index b15d359fab5..985b1ea7a2a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -429,6 +429,12 @@ public:
bool hasSGPRInitBug() const {
return SGPRInitBug;
}
+
+ /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
+ unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
+
+ /// Return the maximum number of waves per SIMD for kernels using \p VGPRs VGPRs
+ unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a86603a11ff..f144ce2eb33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -18,6 +18,7 @@
#include "AMDGPUCallLowering.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "GCNSchedStrategy.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
#include "R600MachineScheduler.h"
@@ -96,6 +97,14 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
return new SIScheduleDAGMI(C);
}
+static ScheduleDAGInstrs *
+createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ ScheduleDAGMILive *DAG =
+ new ScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
+ DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ return DAG;
+}
+
static MachineSchedRegistry
R600SchedRegistry("r600", "Run R600's custom scheduler",
createR600MachineScheduler);
@@ -104,6 +113,11 @@ static MachineSchedRegistry
SISchedRegistry("si", "Run SI's custom scheduler",
createSIMachineScheduler);
+static MachineSchedRegistry
+GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
+ "Run GCN scheduler to maximize occupancy",
+ createGCNMaxOccupancyMachineScheduler);
+
static StringRef computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::r600) {
// 32-bit pointers.
@@ -467,7 +481,7 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>();
if (ST.enableSIScheduler())
return createSIMachineScheduler(C);
- return nullptr;
+ return createGCNMaxOccupancyMachineScheduler(C);
}
bool GCNPassConfig::addPreISel() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index c1ecf09df7a..e58e5b2f92d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -49,6 +49,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
GCNHazardRecognizer.cpp
+ GCNSchedStrategy.cpp
R600ClauseMergePass.cpp
R600ControlFlowFinalizer.cpp
R600EmitClauseMarkers.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
new file mode 100644
index 00000000000..62e9f526904
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -0,0 +1,312 @@
+//===-- GCNSchedStrategy.cpp - GCN Scheduler Strategy ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This contains a MachineSchedStrategy implementation for maximizing wave
+/// occupancy on GCN hardware.
+//===----------------------------------------------------------------------===//
+
+#include "GCNSchedStrategy.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+using namespace llvm;
+
+GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
+ const MachineSchedContext *C) :
+ GenericScheduler(C) { }
+
+static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
+ const MachineFunction &MF) {
+
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
+ ST.getOccupancyWithNumVGPRs(VGPRs));
+ return std::min(MinRegOccupancy,
+ ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+}
+
+void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop, const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI,
+ int SGPRPressure,
+ int VGPRPressure,
+ int SGPRExcessLimit,
+ int VGPRExcessLimit,
+ int SGPRCriticalLimit,
+ int VGPRCriticalLimit) {
+
+ Cand.SU = SU;
+ Cand.AtTop = AtTop;
+
+ // getDownwardPressure() and getUpwardPressure() make temporary changes to
+ // the the tracker, so we need to pass those function a non-const copy.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+ std::vector<unsigned> Pressure;
+ std::vector<unsigned> MaxPressure;
+
+ if (AtTop)
+ TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ else {
+ // FIXME: I think for bottom up scheduling, the register pressure is cached
+ // and can be retrieved by DAG->getPressureDif(SU).
+ TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
+ }
+
+ int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+ int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+
+ // If two instructions increase the pressure of different register sets
+ // by the same amount, the generic scheduler will prefer to schedule the
+ // instruction that increases the set with the least amount of registers,
+ // which in our case would be SGPRs. This is rarely what we want, so
+ // when we report excess/critical register pressure, we do it either
+ // only for VGPRs or only for SGPRs.
+
+ // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
+ const int MaxVGPRPressureInc = 16;
+ bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
+ bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+
+
+ // FIXME: We have to enter REG-EXCESS before we reach the actual threshold
+ // to increase the likelihood we don't go over the limits. We should improve
+ // the analysis to look through dependencies to find the path with the least
+ // register pressure.
+ // FIXME: This is also necessary, because some passes that run after
+ // scheduling and before regalloc increase register pressure.
+ const int ErrorMargin = 3;
+ VGPRExcessLimit -= ErrorMargin;
+ SGPRExcessLimit -= ErrorMargin;
+
+ // We only need to update the RPDelata for instructions that increase
+ // register pressure. Instructions that decrease or keep reg pressure
+ // the same will be marked as RegExcess in tryCandidate() when they
+ // are compared with instructions that increase the register pressure.
+ if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
+ Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
+ }
+
+ if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
+ Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure = SGPRExcessLimit);
+ }
+
+ // Register pressure is considered 'CRITICAL' if it is approaching a value
+ // that would reduce the wave occupancy for the execution unit. When
+ // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both
+ // has the same cost, so we don't need to prefer one over the other.
+
+ VGPRCriticalLimit -= ErrorMargin;
+ SGPRCriticalLimit -= ErrorMargin;
+
+ int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
+ int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+
+ if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+ if (SGPRDelta > VGPRDelta) {
+ Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+ } else {
+ Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
+ }
+ }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeFromQueue()
+void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
+ const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
+ const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
+ const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+ ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
+ unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+ unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned SGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
+ unsigned VGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+ unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
+ unsigned SGPRCriticalLimit = SRI->getNumSGPRsAllowed(ST, MaxWaves);
+ unsigned VGPRCriticalLimit = SRI->getNumVGPRsAllowed(MaxWaves);
+
+ ReadyQueue &Q = Zone.Available;
+ for (SUnit *SU : Q) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
+ SGPRPressure, VGPRPressure,
+ SGPRExcessLimit, VGPRExcessLimit,
+ SGPRCriticalLimit, VGPRCriticalLimit);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(Zone.DAG, SchedModel);
+ Cand.setBest(TryCand);
+ }
+ }
+}
+
+static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
+ switch (Reason) {
+ default:
+ return Reason;
+ case GenericSchedulerBase::RegCritical:
+ case GenericSchedulerBase::RegExcess:
+ return -Reason;
+ }
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNodeBidirectional()
+SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ return SU;
+ }
+ // Set the bottom-up policy based on the state of the current bottom zone and
+ // the instructions outside the zone, including the top zone.
+ CandPolicy BotPolicy;
+ setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ CandPolicy TopPolicy;
+ setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+ // See if BotCand is still valid (because we previously scheduled from Top).
+ DEBUG(dbgs() << "Picking from Bot:\n");
+ if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+ BotCand.Policy != BotPolicy) {
+ BotCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(BotCand));
+ }
+
+ // Check if the top Q has a better candidate.
+ DEBUG(dbgs() << "Picking from Top:\n");
+ if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+ TopCand.Policy != TopPolicy) {
+ TopCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(TopCand));
+ }
+
+ // Pick best from BotCand and TopCand.
+ DEBUG(
+ dbgs() << "Top Cand: ";
+ traceCandidate(BotCand);
+ dbgs() << "Bot Cand: ";
+ traceCandidate(TopCand);
+ );
+ SchedCandidate Cand;
+ if (TopCand.Reason == BotCand.Reason) {
+ Cand = BotCand;
+ GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
+ TopCand.Reason = NoCand;
+ GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
+ } else {
+ TopCand.Reason = TopReason;
+ }
+ } else {
+ if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
+ Cand = TopCand;
+ } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
+ Cand = BotCand;
+ } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+ Cand = TopCand;
+ } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
+ Cand = BotCand;
+ } else {
+ int TopRank = getBidirectionalReasonRank(TopCand.Reason);
+ int BotRank = getBidirectionalReasonRank(BotCand.Reason);
+ if (TopRank > BotRank) {
+ Cand = TopCand;
+ } else {
+ Cand = BotCand;
+ }
+ }
+ }
+ DEBUG(
+ dbgs() << "Picking: ";
+ traceCandidate(Cand);
+ );
+
+ IsTopNode = Cand.AtTop;
+ return Cand.SU;
+}
+
+// This function is mostly cut and pasted from
+// GenericScheduler::pickNode()
+SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ do {
+ if (RegionPolicy.OnlyTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (RegionPolicy.OnlyBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ BotCand.reset(NoPolicy);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidirectional(IsTopNode);
+ }
+ } while (SU->isScheduled);
+
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ return SU;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
new file mode 100644
index 00000000000..4cfc0cea81f
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -0,0 +1,54 @@
+//===-- GCNSchedStrategy.h - GCN Scheduler Strategy -*- C++ -*-------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+
+/// This is a minimal scheduler strategy. The main difference between this
+/// and the GenericScheduler is that GCNSchedStrategy uses different
+/// heuristics to determine excess/critical pressure sets. Its goal is to
+/// maximize kernel occupancy (i.e. maximum number of waves per simd).
+class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand);
+
+ void initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop, const RegPressureTracker &RPTracker,
+ const SIRegisterInfo *SRI,
+ int SGPRPressure, int VGPRPressure,
+ int SGPRExcessLimit, int VGPRExcessLimit,
+ int SGPRCriticalLimit, int VGPRCriticalLimit);
+
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
+ SchedBoundary *Zone, const SIRegisterInfo *SRI,
+ unsigned SGPRPressure, unsigned VGPRPressure);
+
+public:
+ GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
+
+ SUnit *pickNode(bool &IsTopNode) override;
+};
+
+} // End namespace llvm
+
+#endif // GCNSCHEDSTRATEGY_H
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 80fdd854fbd..7d84f7bec8c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -249,6 +249,12 @@ unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
return VGPRLimit;
}
+unsigned
+SIRegisterInfo::getDefaultRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const {
+ return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
+}
+
bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
return Fn.getFrameInfo().hasStackObjects();
}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index b0e852e6127..6e66c52da6c 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -51,6 +51,8 @@ public:
unsigned getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const override;
+ unsigned getDefaultRegPressureSetLimit(const MachineFunction &MF,
+ unsigned Idx) const;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 0db92fc254f..be27966fd5f 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -47,6 +47,10 @@ def Write64Bit : SchedWrite;
class SISchedMachineModel : SchedMachineModel {
let CompleteModel = 1;
+ // MicroOpBufferSize = 1 means that instructions will always be added
+ // the ready queue when they become available. This exposes them
+ // to the register pressure analysis.
+ let MicroOpBufferSize = 1;
let IssueWidth = 1;
let PostRAScheduler = 1;
}
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index 0046bc93826..eb0bf65d9b6 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -258,10 +258,10 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
}
; FUNC-LABEL: {{^}}v_and_multi_use_constant_i64:
-; SI: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
-; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
-; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO0:[0-9]+]]:[[HI0:[0-9]+]]{{\]}}
+; SI-DAG: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
; SI-DAG: s_movk_i32 [[KHI:s[0-9]+]], 0x11e{{$}}
+; SI-DAG: s_mov_b32 [[KLO:s[0-9]+]], 0xab19b207{{$}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI0]]
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KLO]], v[[LO1]]
@@ -284,10 +284,9 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: buffer_load_dwordx2 v{{\[}}[[LO1:[0-9]+]]:[[HI1:[0-9]+]]{{\]}}
; SI-NOT: and
; SI: v_and_b32_e32 v[[RESLO0:[0-9]+]], 63, v[[LO0]]
-; SI-NOT: and
-; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
; SI: v_and_b32_e32 v[[RESLO1:[0-9]+]], 63, v[[LO1]]
; SI-NOT: and
+; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load volatile i64, i64 addrspace(1)* %aptr
@@ -486,8 +485,8 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; low 32-bits, which is not a valid 64-bit inline immmediate.
; FUNC-LABEL: {{^}}s_and_inline_imm_f32_4.0_i64:
-; SI: s_load_dword s
; SI: s_load_dwordx2
+; SI: s_load_dword s
; SI-NOT: and
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
diff --git a/llvm/test/CodeGen/AMDGPU/ctpop64.ll b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
index d0976b7d45b..ee180323fa3 100644
--- a/llvm/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctpop64.ll
@@ -155,8 +155,8 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
}
; FUNC-LABEL: {{^}}s_ctpop_i65:
-; GCN: s_and_b32
; GCN: s_bcnt1_i32_b64 [[REG0:s[0-9]+]],
+; GCN: s_and_b32
; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
; GCN: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index dab3c10d682..c028d1021a3 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -8,7 +8,7 @@
; SI-LABEL: {{^}}offset_order:
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index f6cdbb48c75..7d351d5a348 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,8 +13,10 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; CI: v_ceil_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
+; FIXME: We should be using s_addk_i32 here, but the reg allocation hints
+; are not always followed.
+; SI-DAG: s_add_i32 [[SEXP0:s[0-9]+]], [[SEXP]], 0xfffffc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP0]]
; SI-DAG: s_not_b64
; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index 19deefe4d4a..9126a47e5e6 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -548,7 +548,7 @@ define void @test_f32_interp(float addrspace(1)* %out,
; FUNC-LABEL: {{^}}test_f64_interp:
; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
-; SI: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
+; SI: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
define void @test_f64_interp(double addrspace(1)* %out,
double addrspace(1)* %in1,
double addrspace(1)* %in2,
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
index 9bbfe1e95c5..4dae566d848 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -6,8 +6,8 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone
; SI-LABEL: {{^}}test_fmax3_f64:
; SI-DAG: buffer_load_dwordx2 [[REGA:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
; SI-DAG: buffer_load_dwordx2 [[REGB:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
-; SI-DAG: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_max_f64 [[REGA]], [[REGA]], [[REGB]]
+; SI: buffer_load_dwordx2 [[REGC:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
; SI: buffer_store_dwordx2 [[RESULT]],
; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index bf230b2db04..6d729f4569e 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -22,6 +22,7 @@ entry:
; XXX: Could do v_or_b32 directly
; CHECK-LABEL: {{^}}extract_w_offset_salu_use_vector:
+; CHECK: s_mov_b32 m0
; CHECK-DAG: s_or_b32
; CHECK-DAG: s_or_b32
; CHECK-DAG: s_or_b32
@@ -30,8 +31,7 @@ entry:
; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; CHECK-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
-; CHECK: s_mov_b32 m0
-; CHECK-NEXT: v_movrels_b32_e32
+; CHECK: v_movrels_b32_e32
define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
entry:
%idx = add i32 %in, 1
@@ -242,13 +242,13 @@ entry:
; FIXME: Why is vector copied in between?
; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
-; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
-; CHECK: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK: s_waitcnt vmcnt(0)
; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
@@ -303,8 +303,10 @@ bb2:
; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
; CHECK-DAG: v_mov_b32 [[INS0:v[0-9]+]], 62
-; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
; CHECK-DAG: v_mov_b32_e32 v[[VEC_ELT3:[0-9]+]], s[[S_ELT3]]
+; CHECK: v_mov_b32_e32 v[[VEC_ELT2:[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[VEC_ELT1:[0-9]+]], s{{[0-9]+}}
+; CHECK: v_mov_b32_e32 v[[VEC_ELT0:[0-9]+]], s[[S_ELT0]]
; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
; CHECK-NEXT: v_readfirstlane_b32 [[READLANE:s[0-9]+]], [[IDX0]]
@@ -324,7 +326,7 @@ bb2:
; CHECK: v_cmp_eq_u32_e32 vcc, [[READLANE]], [[IDX0]]
; CHECK: s_mov_b32 m0, [[READLANE]]
; CHECK: s_and_saveexec_b64 vcc, vcc
-; CHECK-NEXT: v_movreld_b32_e32 [[VEC_ELT1]], 63
+; CHECK-NEXT: v_movreld_b32_e32 v[[VEC_ELT1]], 63
; CHECK-NEXT: s_xor_b64 exec, exec, vcc
; CHECK: s_cbranch_execnz [[LOOP1]]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index b1405c7114b..e38ee472de8 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -344,7 +344,7 @@ endif:
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[ELT1:v[0-9]+]], 0x40200000
-; GCN: s_mov_b32 m0, [[SCALEDIDX]]
+; GCN-DAG: s_mov_b32 m0, [[SCALEDIDX]]
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, 0
; Increment to next element folded into base register, but FileCheck
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 397e17225ea..31ff7d931f1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -343,8 +343,8 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <
; FUNC-LABEL: {{^}}constant_zextload_v32i16_to_v32i32:
; GCN-DAG: s_load_dwordx16
; GCN-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[K]]
+; GCN-DAG: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 0, #1
; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 5e1171a69be..86f5d4eb4d5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -360,7 +360,7 @@ define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
; GCN-NOHSA: buffer_load_dwordx4
-; GCN-NOHSA: buffer_load_dwordx4
+; GCN-NOHSA-DAG: buffer_load_dwordx4
; GCN-HSA: flat_load_dwordx4
; GCN-HSA: flat_load_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/missing-store.ll b/llvm/test/CodeGen/AMDGPU/missing-store.ll
index 3d6d7fae0fd..8e1b0036a1a 100644
--- a/llvm/test/CodeGen/AMDGPU/missing-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/missing-store.ll
@@ -6,12 +6,12 @@
; resulting in losing the store to gptr
; FUNC-LABEL: {{^}}missing_store_reduced:
+; SI: s_load_dwordx2
; SI: ds_read_b64
; SI-DAG: buffer_store_dword
; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
-; SI: s_load_dword
-; SI: s_nop 2
+; SI: s_nop 3
; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
; SI: buffer_store_dword
; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 36f12573c17..85dfbe6b8a3 100644
--- a/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -10,10 +10,10 @@
; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[LDPTRLO:[0-9]+]]:[[LDPTRHI:[0-9]+]]{{\]}}
; GCN-NOT: v_mov_b32
-; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
-; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 v[[VARG1LO:[0-9]+]], s[[ARG1LO]]
; GCN-NOT: v_mov_b32
+; GCN: v_mov_b32_e32 v[[VARG1HI:[0-9]+]], s[[ARG1HI]]
+; GCN-NOT: v_mov_b32
; GCN: v_add_i32_e32 v[[PTRLO:[0-9]+]], vcc, v[[LDPTRLO]], v[[VARG1LO]]
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
index 9eb76eb290d..58d411772f1 100644
--- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -103,9 +103,8 @@ define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_multi_use_f32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]|
-; GCN: buffer_store_dword [[RCP]]
-
; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
+; GCN: buffer_store_dword [[RCP]]
; GCN: buffer_store_dword [[MUL]]
define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
%src.fabs = call float @llvm.fabs.f32(float %src)
diff --git a/llvm/test/CodeGen/AMDGPU/ret.ll b/llvm/test/CodeGen/AMDGPU/ret.ll
index 915c4383ff4..0408413f547 100644
--- a/llvm/test/CodeGen/AMDGPU/ret.ll
+++ b/llvm/test/CodeGen/AMDGPU/ret.ll
@@ -18,12 +18,13 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i
}
; GCN-LABEL: {{^}}vgpr_literal:
-; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
-; GCN: s_waitcnt expcnt(0)
+; GCN: v_mov_b32_e32 v4, v0
+; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
; GCN-DAG: v_mov_b32_e32 v3, -1.0
+; GCN: s_waitcnt expcnt(0)
; GCN-NOT: s_endpgm
define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
@@ -229,13 +230,14 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2
; GCN-LABEL: {{^}}structure_literal:
-; GCN: exp 15, 0, 1, 1, 1, v0, v0, v0, v0
-; GCN: s_waitcnt expcnt(0)
+; GCN: v_mov_b32_e32 v3, v0
+; GCN: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
; GCN-DAG: v_mov_b32_e32 v0, 1.0
; GCN-DAG: s_mov_b32 s0, 2
; GCN-DAG: s_mov_b32 s1, 3
; GCN-DAG: v_mov_b32_e32 v1, 2.0
; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN: s_waitcnt expcnt(0)
define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 87362dc1b0f..53448340163 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -134,8 +134,8 @@ define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b
; GCN-LABEL: {{^}}v_sad_u32_multi_use_select_pat2:
; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
+; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
%icmp0 = icmp ugt i32 %a, %b
%sub0 = sub i32 %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
index 52f3cceac2a..fd73ff86af5 100644
--- a/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/llvm/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -170,14 +170,12 @@ entry:
; CI.
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
+; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
-; GCN-NOHSA-NOT: v_add
-; GCN-NOHSA: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
-; GCN-NOHSA-NOT: v_add
; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
+; GCN-NOHSA: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/select-vectors.ll b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
index faf8d8a12c2..d5c1dd97a73 100644
--- a/llvm/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-vectors.ll
@@ -93,13 +93,13 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32>
; SI-DAG: s_load_dwordx2 s{{\[}}[[ALO:[0-9]+]]:[[AHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI-DAG: s_load_dwordx2 s{{\[}}[[BLO:[0-9]+]]:[[BHI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xd|0x34}}
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[AHI]]
-; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[BHI]]
+; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[ALO]]
; SI-DAG: v_cmp_eq_i32_e64 vcc, 0, s{{[0-9]+}}
; SI: v_cndmask_b32_e32
+; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
; SI: v_cndmask_b32_e32
; SI: buffer_store_dwordx2
define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index c5dbfd9589a..b85714ea54c 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -341,11 +341,12 @@ define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i6
; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO]]{{\]}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index f6ed41d9dcd..8523f1c0c61 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -70,9 +70,9 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
}
; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
-; CI-DAG: buffer_store_dword
; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
+; CI: buffer_store_dword
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
; CI: buffer_store_dword
@@ -136,9 +136,9 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32
}
; FUNC-LABEL: @reorder_global_load_local_store_global_load
+; CI: ds_write_b32
; CI: buffer_load_dword
; CI: buffer_load_dword
-; CI: ds_write_b32
; CI: buffer_store_dword
define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
@@ -181,11 +181,11 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
; FUNC-LABEL: @reorder_global_offsets
; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
; CI: s_endpgm
define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
%ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
index dbd07fee6bb..db4c1c842d0 100644
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -35,11 +35,11 @@ define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
; SI: s_addc_u32
+; SI: buffer_store_dword v[[LO_VREG]],
; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
; SI: v_mov_b32_e32
-; SI: buffer_store_dword v[[LO_VREG]],
define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
%aa = add i64 %a, 234 ; Prevent shrinking store.
%b = shl i64 %aa, 2
diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll
index 268f3c764d6..078df530733 100644
--- a/llvm/test/CodeGen/AMDGPU/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll
@@ -38,16 +38,16 @@
; SI: v_cndmask_b32_e64
; SI: v_mul_hi_u32 [[Quotient:v[0-9]+]]
; SI: v_mul_lo_i32 [[Num_S_Remainder:v[0-9]+]]
+; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
; SI-DAG: v_sub_i32_e32 [[Remainder:v[0-9]+]], vcc, {{[vs][0-9]+}}, [[Num_S_Remainder]]
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
-; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
-; SI-DAG: v_add_i32_e32 [[Quotient_A_One:v[0-9]+]], vcc, 1, [[Quotient]]
; SI-DAG: v_subrev_i32_e32 [[Quotient_S_One:v[0-9]+]],
+; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
+; SI: v_and_b32_e32 [[Tmp1:v[0-9]+]]
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_add_i32_e32 [[Remainder_A_Den:v[0-9]+]],
-; SI-DAG: v_subrev_i32_e32 [[Remainder_S_Den:v[0-9]+]],
; SI-DAG: v_cndmask_b32_e64
; SI-DAG: v_cndmask_b32_e64
; SI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 7a72a4c7ba0..87fcfbade14 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -42,15 +42,18 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN: buffer_load_dword [[VA0:v[0-9]+]]
-; GCN: buffer_load_dword [[VA1:v[0-9]+]]
; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
+; SI: buffer_load_dword [[VA0:v[0-9]+]]
+; SI: buffer_load_dword [[VA1:v[0-9]+]]
; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN-NOT: v_mov_b32
+; VI: buffer_load_dword [[VA0:v[0-9]+]]
+; VI: buffer_load_dword [[VA1:v[0-9]+]]
+
; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]]
; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
; GCN: buffer_store_dword [[RESULT0]]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 7d97777a78b..52fa0bec61a 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -12,16 +12,16 @@
; GCN-LABEL: {{^}}main:
-; GCN-DAG: s_mov_b32 s13, s12
-; GCN-DAG: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s18, -1
-; SI-DAG: s_mov_b32 s19, 0xe8f000
-; VI-DAG: s_mov_b32 s19, 0xe80000
+; GCN-DAG: s_mov_b32 s11, s12
+; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s14, -1
+; SI-DAG: s_mov_b32 s15, 0xe8f000
+; VI-DAG: s_mov_b32 s15, 0xe80000
-; s13 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[16:19], s13 offset:{{[0-9]+}} ; 16-byte Folded Reload
+; s11 is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 16-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024
OpenPOWER on IntegriCloud