summaryrefslogtreecommitdiffstats
path: root/llvm/tools/llvm-exegesis/lib/Uops.cpp
diff options
context:
space:
mode:
authorClement Courbet <courbet@google.com>2018-05-17 08:12:29 +0000
committerClement Courbet <courbet@google.com>2018-05-17 08:12:29 +0000
commit295a554ce4a29ab2ef5e9ab538ac83596a536635 (patch)
tree2bb66578ef79379869c71cd47850b712d03ac02e /llvm/tools/llvm-exegesis/lib/Uops.cpp
parentee110fb735651d63252f70104d39c5a0376ee20d (diff)
downloadbcm5719-llvm-295a554ce4a29ab2ef5e9ab538ac83596a536635.tar.gz
bcm5719-llvm-295a554ce4a29ab2ef5e9ab538ac83596a536635.zip
Revert r332579 "[llvm-exegesis] Update to cover latency through another opcode."
The revision failed to update the ARM tests. llvm-svn: 332580
Diffstat (limited to 'llvm/tools/llvm-exegesis/lib/Uops.cpp')
-rw-r--r--llvm/tools/llvm-exegesis/lib/Uops.cpp351
1 files changed, 155 insertions, 196 deletions
diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/Uops.cpp
index 94282862989..59d2fc408fb 100644
--- a/llvm/tools/llvm-exegesis/lib/Uops.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Uops.cpp
@@ -8,130 +8,29 @@
//===----------------------------------------------------------------------===//
#include "Uops.h"
-
-#include "Assembler.h"
-#include "BenchmarkRunner.h"
-#include "MCInstrDescView.h"
+#include "BenchmarkResult.h"
+#include "InstructionSnippetGenerator.h"
#include "PerfHelper.h"
-
-// FIXME: Load constants into registers (e.g. with fld1) to not break
-// instructions like x87.
-
-// Ideally we would like the only limitation on executing uops to be the issue
-// ports. Maximizing port pressure increases the likelihood that the load is
-// distributed evenly across possible ports.
-
-// To achieve that, one approach is to generate instructions that do not have
-// data dependencies between them.
-//
-// For some instructions, this is trivial:
-// mov rax, qword ptr [rsi]
-// mov rax, qword ptr [rsi]
-// mov rax, qword ptr [rsi]
-// mov rax, qword ptr [rsi]
-// For the above snippet, haswell just renames rax four times and executes the
-// four instructions two at a time on P23 and P0126.
-//
-// For some instructions, we just need to make sure that the source is
-// different from the destination. For example, IDIV8r reads from GPR and
-// writes to AX. We just need to ensure that the Var is assigned a
-// register which is different from AX:
-// idiv bx
-// idiv bx
-// idiv bx
-// idiv bx
-// The above snippet will be able to fully saturate the ports, while the same
-// with ax would issue one uop every `latency(IDIV8r)` cycles.
-//
-// Some instructions make this harder because they both read and write from
-// the same register:
-// inc rax
-// inc rax
-// inc rax
-// inc rax
-// This has a data dependency from each instruction to the next, limit the
-// number of instructions that can be issued in parallel.
-// It turns out that this is not a big issue on recent Intel CPUs because they
-// have heuristics to balance port pressure. In the snippet above, subsequent
-// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
-// might end up executing them all on P0 (just because they can), or try
-// avoiding P5 because it's usually under high pressure from vector
-// instructions.
-// This issue is even more important for high-latency instructions because
-// they increase the idle time of the CPU, e.g. :
-// imul rax, rbx
-// imul rax, rbx
-// imul rax, rbx
-// imul rax, rbx
-//
-// To avoid that, we do the renaming statically by generating as many
-// independent exclusive assignments as possible (until all possible registers
-// are exhausted) e.g.:
-// imul rax, rbx
-// imul rcx, rbx
-// imul rdx, rbx
-// imul r8, rbx
-//
-// Some instruction even make the above static renaming impossible because
-// they implicitly read and write from the same operand, e.g. ADC16rr reads
-// and writes from EFLAGS.
-// In that case we just use a greedy register assignment and hope for the
-// best.
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <random>
+#include <unordered_map>
+#include <unordered_set>
namespace exegesis {
-static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) {
- return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN;
-}
-
-// FIXME: Handle memory, see PR36905.
-static bool hasMemoryOperand(const llvm::MCOperandInfo &OpInfo) {
- return OpInfo.OperandType == llvm::MCOI::OPERAND_MEMORY;
-}
-
-static bool isInfeasible(const Instruction &Instruction, std::string &Error) {
- const auto &MCInstrDesc = Instruction.Description;
- if (MCInstrDesc.isPseudo()) {
- Error = "is pseudo";
- return true;
- }
- if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand)) {
- Error = "has unknown operands";
- return true;
- }
- if (llvm::any_of(MCInstrDesc.operands(), hasMemoryOperand)) {
- Error = "has memory operands";
+// FIXME: Handle memory (see PR36906)
+static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
+ switch (OpInfo.OperandType) {
+ default:
return true;
+ case llvm::MCOI::OPERAND_IMMEDIATE:
+ case llvm::MCOI::OPERAND_REGISTER:
+ return false;
}
- return false;
-}
-
-// Returns whether this Variable ties Use and Def operands together.
-static bool hasTiedOperands(const Variable *Var) {
- bool HasUse = false;
- bool HasDef = false;
- for (const Operand *Op : Var->TiedOperands) {
- if (Op->IsDef)
- HasDef = true;
- else
- HasUse = true;
- }
- return HasUse && HasDef;
-}
-
-static llvm::SmallVector<Variable *, 8>
-getTiedVariables(const Instruction &Instruction) {
- llvm::SmallVector<Variable *, 8> Result;
- for (auto *Var : Instruction.Variables)
- if (hasTiedOperands(Var))
- Result.push_back(Var);
- return Result;
-}
-
-static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
- assert(a.size() == b.size());
- for (auto I : b.set_bits())
- a.reset(I);
}
static llvm::Error makeError(llvm::Twine Msg) {
@@ -139,89 +38,153 @@ static llvm::Error makeError(llvm::Twine Msg) {
llvm::inconvertibleErrorCode());
}
+static std::vector<llvm::MCInst> generateIndependentAssignments(
+ const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
+ llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
+ std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
+ for (const Variable &Var : Vars) {
+ if (Var.IsUse) {
+ IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
+ Var.PossibleRegisters.end());
+ }
+ }
+
+ std::vector<llvm::MCInst> Pattern;
+ for (int A = 0; A < MaxAssignments; ++A) {
+ // FIXME: This is a bit pessimistic. We should get away with an
+ // assignment that ensures that the set of assigned registers for uses and
+ // the set of assigned registers for defs do not intersect (registers
+ // for uses (resp defs) do not have to be all distinct).
+ const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
+ if (Regs.empty())
+ break;
+ // Remove all assigned registers defs that are used by at least one other
+ // variable from the list of possible variable registers. This ensures that
+ // we never create a RAW hazard that would lead to serialization.
+ for (size_t I = 0, E = Vars.size(); I < E; ++I) {
+ llvm::MCPhysReg Reg = Regs[I];
+ if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
+ Vars[I].PossibleRegisters.remove(Reg);
+ }
+ }
+ // Create an MCInst and check assembly.
+ llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
+ if (!State.canAssemble(Inst))
+ continue;
+ Pattern.push_back(std::move(Inst));
+ }
+ return Pattern;
+}
+
UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
-llvm::Expected<std::vector<llvm::MCInst>>
-UopsBenchmarkRunner::createSnippet(RegisterAliasingTrackerCache &RATC,
- unsigned Opcode,
- llvm::raw_ostream &Info) const {
- std::vector<llvm::MCInst> Snippet;
- const llvm::MCInstrDesc &MCInstrDesc = MCInstrInfo.get(Opcode);
- const Instruction Instruction(MCInstrDesc, RATC);
-
- std::string Error;
- if (isInfeasible(Instruction, Error)) {
- llvm::report_fatal_error(llvm::Twine("Infeasible : ").concat(Error));
+llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
+ const LLVMState &State, const unsigned OpcodeIndex,
+ const unsigned NumRepetitions, const JitFunctionContext &Context) const {
+ const auto &InstrInfo = State.getInstrInfo();
+ const auto &RegInfo = State.getRegInfo();
+ const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
+ for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
+ if (isInvalidOperand(OpInfo))
+ return makeError("Only registers and immediates are supported");
}
- const AliasingConfigurations SelfAliasing(Instruction, Instruction);
- if (SelfAliasing.empty()) {
- Info << "instruction is parallel, repeating a random one.\n";
- Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
- return Snippet;
+ // FIXME: Load constants into registers (e.g. with fld1) to not break
+ // instructions like x87.
+
+ // Ideally we would like the only limitation on executing uops to be the issue
+ // ports. Maximizing port pressure increases the likelihood that the load is
+ // distributed evenly across possible ports.
+
+ // To achieve that, one approach is to generate instructions that do not have
+ // data dependencies between them.
+ //
+ // For some instructions, this is trivial:
+ // mov rax, qword ptr [rsi]
+ // mov rax, qword ptr [rsi]
+ // mov rax, qword ptr [rsi]
+ // mov rax, qword ptr [rsi]
+ // For the above snippet, haswell just renames rax four times and executes the
+ // four instructions two at a time on P23 and P0126.
+ //
+ // For some instructions, we just need to make sure that the source is
+ // different from the destination. For example, IDIV8r reads from GPR and
+ // writes to AX. We just need to ensure that the variable is assigned a
+ // register which is different from AX:
+ // idiv bx
+ // idiv bx
+ // idiv bx
+ // idiv bx
+ // The above snippet will be able to fully saturate the ports, while the same
+ // with ax would issue one uop every `latency(IDIV8r)` cycles.
+ //
+ // Some instructions make this harder because they both read and write from
+ // the same register:
+ // inc rax
+ // inc rax
+ // inc rax
+ // inc rax
+ // This has a data dependency from each instruction to the next, limit the
+ // number of instructions that can be issued in parallel.
+ // It turns out that this is not a big issue on recent Intel CPUs because they
+ // have heuristics to balance port pressure. In the snippet above, subsequent
+ // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
+ // might end up executing them all on P0 (just because they can), or try
+ // avoiding P5 because it's usually under high pressure from vector
+ // instructions.
+ // This issue is even more important for high-latency instructions because
+ // they increase the idle time of the CPU, e.g. :
+ // imul rax, rbx
+ // imul rax, rbx
+ // imul rax, rbx
+ // imul rax, rbx
+ //
+ // To avoid that, we do the renaming statically by generating as many
+ // independent exclusive assignments as possible (until all possible registers
+ // are exhausted) e.g.:
+ // imul rax, rbx
+ // imul rcx, rbx
+ // imul rdx, rbx
+ // imul r8, rbx
+ //
+ // Some instruction even make the above static renaming impossible because
+ // they implicitly read and write from the same operand, e.g. ADC16rr reads
+ // and writes from EFLAGS.
+ // In that case we just use a greedy register assignment and hope for the
+ // best.
+
+ const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
+
+ // Generate as many independent exclusive assignments as possible.
+ constexpr const int MaxStaticRenames = 20;
+ std::vector<llvm::MCInst> Pattern =
+ generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
+ if (Pattern.empty()) {
+ // We don't even have a single exclusive assignment, fallback to a greedy
+ // assignment.
+ // FIXME: Tell the user about this decision to help debugging.
+ const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
+ if (!Vars.empty() && Regs.empty())
+ return makeError("No feasible greedy assignment");
+ llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
+ if (!State.canAssemble(Inst))
+ return makeError("Cannot assemble greedy assignment");
+ Pattern.push_back(std::move(Inst));
}
- if (SelfAliasing.hasImplicitAliasing()) {
- Info << "instruction is serial, repeating a random one.\n";
- Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
- return Snippet;
- }
- const auto TiedVariables = getTiedVariables(Instruction);
- if (!TiedVariables.empty()) {
- if (TiedVariables.size() > 1) {
- Info << "Not yet implemented, don't know how to handle several tied "
- "variables\n";
- return makeError("Infeasible : don't know how to handle several tied "
- "variables");
- }
- Info << "instruction has tied variables using static renaming.\n";
- Variable *Var = TiedVariables.front();
- assert(Var);
- assert(!Var->TiedOperands.empty());
- const Operand &Operand = *Var->TiedOperands.front();
- assert(Operand.Tracker);
- for (const llvm::MCPhysReg Reg : Operand.Tracker->sourceBits().set_bits()) {
- clearVariableAssignments(Instruction);
- Var->AssignedValue = llvm::MCOperand::createReg(Reg);
- Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
- }
- return Snippet;
- }
- // No tied variables, we pick random values for defs.
- llvm::BitVector Defs(MCRegisterInfo.getNumRegs());
- for (const auto &Op : Instruction.Operands) {
- if (Op.Tracker && Op.IsExplicit && Op.IsDef) {
- assert(Op.Var);
- auto PossibleRegisters = Op.Tracker->sourceBits();
- remove(PossibleRegisters, RATC.reservedRegisters());
- assert(PossibleRegisters.any() && "No register left to choose from");
- const auto RandomReg = randomBit(PossibleRegisters);
- Defs.set(RandomReg);
- Op.Var->AssignedValue = llvm::MCOperand::createReg(RandomReg);
- }
- }
- // And pick random use values that are not reserved and don't alias with defs.
- const auto DefAliases = getAliasedBits(MCRegisterInfo, Defs);
- for (const auto &Op : Instruction.Operands) {
- if (Op.Tracker && Op.IsExplicit && !Op.IsDef) {
- assert(Op.Var);
- auto PossibleRegisters = Op.Tracker->sourceBits();
- remove(PossibleRegisters, RATC.reservedRegisters());
- remove(PossibleRegisters, DefAliases);
- assert(PossibleRegisters.any() && "No register left to choose from");
- const auto RandomReg = randomBit(PossibleRegisters);
- Op.Var->AssignedValue = llvm::MCOperand::createReg(RandomReg);
- }
- }
- Info
- << "instruction has no tied variables picking Uses different from defs\n";
- Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
- return Snippet;
+
+ // Generate repetitions of the pattern until benchmark_iterations is reached.
+ std::vector<llvm::MCInst> Result;
+ Result.reserve(NumRepetitions);
+ for (unsigned I = 0; I < NumRepetitions; ++I)
+ Result.push_back(Pattern[I % Pattern.size()]);
+ return Result;
}
std::vector<BenchmarkMeasure>
-UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
+UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
+ const JitFunction &Function,
const unsigned NumRepetitions) const {
const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
@@ -234,11 +197,7 @@ UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
continue;
// FIXME: Sum results when there are several counters for a single ProcRes
// (e.g. P23 on SandyBridge).
- pfm::PerfEvent UopPerfEvent(PfmCounters);
- if (!UopPerfEvent.valid())
- llvm::report_fatal_error(
- llvm::Twine("invalid perf event ").concat(PfmCounters));
- pfm::Counter Counter(UopPerfEvent);
+ pfm::Counter Counter{pfm::PerfEvent(PfmCounters)};
Counter.start();
Function();
Counter.stop();
OpenPOWER on IntegriCloud