1 files changed, 196 insertions, 155 deletions
diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/Uops.cpp
index 59d2fc408fb..94282862989 100644
--- a/llvm/tools/llvm-exegesis/lib/Uops.cpp
+++ b/llvm/tools/llvm-exegesis/lib/Uops.cpp
@@ -8,183 +8,220 @@
 //===----------------------------------------------------------------------===//
 
 #include "Uops.h"
-#include "BenchmarkResult.h"
-#include "InstructionSnippetGenerator.h"
+
+#include "Assembler.h"
+#include "BenchmarkRunner.h"
+#include "MCInstrDescView.h"
 #include "PerfHelper.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCInstrDesc.h"
-#include "llvm/MC/MCSchedule.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <random>
-#include <unordered_map>
-#include <unordered_set>
+
+// FIXME: Load constants into registers (e.g. with fld1) to not break
+// instructions like x87.
+
+// Ideally we would like the only limitation on executing uops to be the issue
+// ports. Maximizing port pressure increases the likelihood that the load is
+// distributed evenly across possible ports.
+
+// To achieve that, one approach is to generate instructions that do not have
+// data dependencies between them.
+//
+// For some instructions, this is trivial:
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+//    mov rax, qword ptr [rsi]
+// For the above snippet, haswell just renames rax four times and executes the
+// four instructions two at a time on P23 and P0126.
+//
+// For some instructions, we just need to make sure that the source is
+// different from the destination. For example, IDIV8r reads from GPR and
+// writes to AX. We just need to ensure that the Var is assigned a
+// register which is different from AX:
+//    idiv bx
+//    idiv bx
+//    idiv bx
+//    idiv bx
+// The above snippet will be able to fully saturate the ports, while the same
+// with ax would issue one uop every `latency(IDIV8r)` cycles.
+//
+// Some instructions make this harder because they both read and write from
+// the same register:
+//    inc rax
+//    inc rax
+//    inc rax
+//    inc rax
+// This has a data dependency from each instruction to the next, limit the
+// number of instructions that can be issued in parallel.
+// It turns out that this is not a big issue on recent Intel CPUs because they
+// have heuristics to balance port pressure. In the snippet above, subsequent
+// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
+// might end up executing them all on P0 (just because they can), or try
+// avoiding P5 because it's usually under high pressure from vector
+// instructions.
+// This issue is even more important for high-latency instructions because
+// they increase the idle time of the CPU, e.g. :
+//    imul rax, rbx
+//    imul rax, rbx
+//    imul rax, rbx
+//    imul rax, rbx
+//
+// To avoid that, we do the renaming statically by generating as many
+// independent exclusive assignments as possible (until all possible registers
+// are exhausted) e.g.:
+//    imul rax, rbx
+//    imul rcx, rbx
+//    imul rdx, rbx
+//    imul r8,  rbx
+//
+// Some instruction even make the above static renaming impossible because
+// they implicitly read and write from the same operand, e.g. ADC16rr reads
+// and writes from EFLAGS.
+// In that case we just use a greedy register assignment and hope for the
+// best.
 
 namespace exegesis {
 
-// FIXME: Handle memory (see PR36906)
-static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) {
-  switch (OpInfo.OperandType) {
-  default:
-    return true;
-  case llvm::MCOI::OPERAND_IMMEDIATE:
-  case llvm::MCOI::OPERAND_REGISTER:
-    return false;
-  }
+static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) {
+  return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN;
 }
 
-static llvm::Error makeError(llvm::Twine Msg) {
-  return llvm::make_error<llvm::StringError>(Msg,
-                                             llvm::inconvertibleErrorCode());
+// FIXME: Handle memory, see PR36905.
+static bool hasMemoryOperand(const llvm::MCOperandInfo &OpInfo) {
+  return OpInfo.OperandType == llvm::MCOI::OPERAND_MEMORY;
 }
 
-static std::vector<llvm::MCInst> generateIndependentAssignments(
-    const LLVMState &State, const llvm::MCInstrDesc &InstrDesc,
-    llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) {
-  std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar;
-  for (const Variable &Var : Vars) {
-    if (Var.IsUse) {
-      IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(),
-                            Var.PossibleRegisters.end());
-    }
+static bool isInfeasible(const Instruction &Instruction, std::string &Error) {
+  const auto &MCInstrDesc = Instruction.Description;
+  if (MCInstrDesc.isPseudo()) {
+    Error = "is pseudo";
+    return true;
   }
+  if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand)) {
+    Error = "has unknown operands";
+    return true;
+  }
+  if (llvm::any_of(MCInstrDesc.operands(), hasMemoryOperand)) {
+    Error = "has memory operands";
+    return true;
+  }
+  return false;
+}
 
-  std::vector<llvm::MCInst> Pattern;
-  for (int A = 0; A < MaxAssignments; ++A) {
-    // FIXME: This is a bit pessimistic. We should get away with an
-    // assignment that ensures that the set of assigned registers for uses and
-    // the set of assigned registers for defs do not intersect (registers
-    // for uses (resp defs) do not have to be all distinct).
-    const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars);
-    if (Regs.empty())
-      break;
-    // Remove all assigned registers defs that are used by at least one other
-    // variable from the list of possible variable registers. This ensures that
-    // we never create a RAW hazard that would lead to serialization.
-    for (size_t I = 0, E = Vars.size(); I < E; ++I) {
-      llvm::MCPhysReg Reg = Regs[I];
-      if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) {
-        Vars[I].PossibleRegisters.remove(Reg);
-      }
-    }
-    // Create an MCInst and check assembly.
-    llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
-    if (!State.canAssemble(Inst))
-      continue;
-    Pattern.push_back(std::move(Inst));
+// Returns whether this Variable ties Use and Def operands together.
+static bool hasTiedOperands(const Variable *Var) {
+  bool HasUse = false;
+  bool HasDef = false;
+  for (const Operand *Op : Var->TiedOperands) {
+    if (Op->IsDef)
+      HasDef = true;
+    else
+      HasUse = true;
   }
-  return Pattern;
+  return HasUse && HasDef;
+}
+
+static llvm::SmallVector<Variable *, 8>
+getTiedVariables(const Instruction &Instruction) {
+  llvm::SmallVector<Variable *, 8> Result;
+  for (auto *Var : Instruction.Variables)
+    if (hasTiedOperands(Var))
+      Result.push_back(Var);
+  return Result;
+}
+
+static void remove(llvm::BitVector &a, const llvm::BitVector &b) {
+  assert(a.size() == b.size());
+  for (auto I : b.set_bits())
+    a.reset(I);
+}
+
+static llvm::Error makeError(llvm::Twine Msg) {
+  return llvm::make_error<llvm::StringError>(Msg,
+                                             llvm::inconvertibleErrorCode());
 }
 
 UopsBenchmarkRunner::~UopsBenchmarkRunner() = default;
 
 const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; }
 
-llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode(
-    const LLVMState &State, const unsigned OpcodeIndex,
-    const unsigned NumRepetitions, const JitFunctionContext &Context) const {
-  const auto &InstrInfo = State.getInstrInfo();
-  const auto &RegInfo = State.getRegInfo();
-  const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex);
-  for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) {
-    if (isInvalidOperand(OpInfo))
-      return makeError("Only registers and immediates are supported");
+llvm::Expected<std::vector<llvm::MCInst>>
+UopsBenchmarkRunner::createSnippet(RegisterAliasingTrackerCache &RATC,
+                                   unsigned Opcode,
+                                   llvm::raw_ostream &Info) const {
+  std::vector<llvm::MCInst> Snippet;
+  const llvm::MCInstrDesc &MCInstrDesc = MCInstrInfo.get(Opcode);
+  const Instruction Instruction(MCInstrDesc, RATC);
+
+  std::string Error;
+  if (isInfeasible(Instruction, Error)) {
+    llvm::report_fatal_error(llvm::Twine("Infeasible : ").concat(Error));
   }
 
-  // FIXME: Load constants into registers (e.g. with fld1) to not break
-  // instructions like x87.
-
-  // Ideally we would like the only limitation on executing uops to be the issue
-  // ports. Maximizing port pressure increases the likelihood that the load is
-  // distributed evenly across possible ports.
-
-  // To achieve that, one approach is to generate instructions that do not have
-  // data dependencies between them.
-  //
-  // For some instructions, this is trivial:
-  //    mov rax, qword ptr [rsi]
-  //    mov rax, qword ptr [rsi]
-  //    mov rax, qword ptr [rsi]
-  //    mov rax, qword ptr [rsi]
-  // For the above snippet, haswell just renames rax four times and executes the
-  // four instructions two at a time on P23 and P0126.
-  //
-  // For some instructions, we just need to make sure that the source is
-  // different from the destination. For example, IDIV8r reads from GPR and
-  // writes to AX. We just need to ensure that the variable is assigned a
-  // register which is different from AX:
-  //    idiv bx
-  //    idiv bx
-  //    idiv bx
-  //    idiv bx
-  // The above snippet will be able to fully saturate the ports, while the same
-  // with ax would issue one uop every `latency(IDIV8r)` cycles.
-  //
-  // Some instructions make this harder because they both read and write from
-  // the same register:
-  //    inc rax
-  //    inc rax
-  //    inc rax
-  //    inc rax
-  // This has a data dependency from each instruction to the next, limit the
-  // number of instructions that can be issued in parallel.
-  // It turns out that this is not a big issue on recent Intel CPUs because they
-  // have heuristics to balance port pressure. In the snippet above, subsequent
-  // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs
-  // might end up executing them all on P0 (just because they can), or try
-  // avoiding P5 because it's usually under high pressure from vector
-  // instructions.
-  // This issue is even more important for high-latency instructions because
-  // they increase the idle time of the CPU, e.g. :
-  //    imul rax, rbx
-  //    imul rax, rbx
-  //    imul rax, rbx
-  //    imul rax, rbx
-  //
-  // To avoid that, we do the renaming statically by generating as many
-  // independent exclusive assignments as possible (until all possible registers
-  // are exhausted) e.g.:
-  //    imul rax, rbx
-  //    imul rcx, rbx
-  //    imul rdx, rbx
-  //    imul r8,  rbx
-  //
-  // Some instruction even make the above static renaming impossible because
-  // they implicitly read and write from the same operand, e.g. ADC16rr reads
-  // and writes from EFLAGS.
-  // In that case we just use a greedy register assignment and hope for the
-  // best.
-
-  const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs());
-
-  // Generate as many independent exclusive assignments as possible.
-  constexpr const int MaxStaticRenames = 20;
-  std::vector<llvm::MCInst> Pattern =
-      generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames);
-  if (Pattern.empty()) {
-    // We don't even have a single exclusive assignment, fallback to a greedy
-    // assignment.
-    // FIXME: Tell the user about this decision to help debugging.
-    const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars);
-    if (!Vars.empty() && Regs.empty())
-      return makeError("No feasible greedy assignment");
-    llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs);
-    if (!State.canAssemble(Inst))
-      return makeError("Cannot assemble greedy assignment");
-    Pattern.push_back(std::move(Inst));
+  const AliasingConfigurations SelfAliasing(Instruction, Instruction);
+  if (SelfAliasing.empty()) {
+    Info << "instruction is parallel, repeating a random one.\n";
+    Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
+    return Snippet;
   }
-
-  // Generate repetitions of the pattern until benchmark_iterations is reached.
-  std::vector<llvm::MCInst> Result;
-  Result.reserve(NumRepetitions);
-  for (unsigned I = 0; I < NumRepetitions; ++I)
-    Result.push_back(Pattern[I % Pattern.size()]);
-  return Result;
+  if (SelfAliasing.hasImplicitAliasing()) {
+    Info << "instruction is serial, repeating a random one.\n";
+    Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
+    return Snippet;
+  }
+  const auto TiedVariables = getTiedVariables(Instruction);
+  if (!TiedVariables.empty()) {
+    if (TiedVariables.size() > 1) {
+      Info << "Not yet implemented, don't know how to handle several tied "
+              "variables\n";
+      return makeError("Infeasible : don't know how to handle several tied "
+                       "variables");
+    }
+    Info << "instruction has tied variables using static renaming.\n";
+    Variable *Var = TiedVariables.front();
+    assert(Var);
+    assert(!Var->TiedOperands.empty());
+    const Operand &Operand = *Var->TiedOperands.front();
+    assert(Operand.Tracker);
+    for (const llvm::MCPhysReg Reg : Operand.Tracker->sourceBits().set_bits()) {
+      clearVariableAssignments(Instruction);
+      Var->AssignedValue = llvm::MCOperand::createReg(Reg);
+      Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
+    }
+    return Snippet;
+  }
+  // No tied variables, we pick random values for defs.
+  llvm::BitVector Defs(MCRegisterInfo.getNumRegs());
+  for (const auto &Op : Instruction.Operands) {
+    if (Op.Tracker && Op.IsExplicit && Op.IsDef) {
+      assert(Op.Var);
+      auto PossibleRegisters = Op.Tracker->sourceBits();
+      remove(PossibleRegisters, RATC.reservedRegisters());
+      assert(PossibleRegisters.any() && "No register left to choose from");
+      const auto RandomReg = randomBit(PossibleRegisters);
+      Defs.set(RandomReg);
+      Op.Var->AssignedValue = llvm::MCOperand::createReg(RandomReg);
+    }
+  }
+  // And pick random use values that are not reserved and don't alias with defs.
+  const auto DefAliases = getAliasedBits(MCRegisterInfo, Defs);
+  for (const auto &Op : Instruction.Operands) {
+    if (Op.Tracker && Op.IsExplicit && !Op.IsDef) {
+      assert(Op.Var);
+      auto PossibleRegisters = Op.Tracker->sourceBits();
+      remove(PossibleRegisters, RATC.reservedRegisters());
+      remove(PossibleRegisters, DefAliases);
+      assert(PossibleRegisters.any() && "No register left to choose from");
+      const auto RandomReg = randomBit(PossibleRegisters);
+      Op.Var->AssignedValue = llvm::MCOperand::createReg(RandomReg);
+    }
+  }
+  Info
+      << "instruction has no tied variables picking Uses different from defs\n";
+  Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction));
+  return Snippet;
 }
 
 std::vector<BenchmarkMeasure>
-UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
-                                     const JitFunction &Function,
+UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function,
                                      const unsigned NumRepetitions) const {
   const auto &SchedModel = State.getSubtargetInfo().getSchedModel();
 
@@ -197,7 +234,11 @@ UopsBenchmarkRunner::runMeasurements(const LLVMState &State,
       continue;
     // FIXME: Sum results when there are several counters for a single ProcRes
     // (e.g. P23 on SandyBridge).
-    pfm::Counter Counter{pfm::PerfEvent(PfmCounters)};
+    pfm::PerfEvent UopPerfEvent(PfmCounters);
+    if (!UopPerfEvent.valid())
+      llvm::report_fatal_error(
+          llvm::Twine("invalid perf event ").concat(PfmCounters));
+    pfm::Counter Counter(UopPerfEvent);
     Counter.start();
     Function();
     Counter.stop();