diff options
Diffstat (limited to 'llvm/tools/llvm-exegesis/lib/Uops.cpp')
-rw-r--r-- | llvm/tools/llvm-exegesis/lib/Uops.cpp | 351 |
1 files changed, 196 insertions, 155 deletions
diff --git a/llvm/tools/llvm-exegesis/lib/Uops.cpp b/llvm/tools/llvm-exegesis/lib/Uops.cpp index 59d2fc408fb..94282862989 100644 --- a/llvm/tools/llvm-exegesis/lib/Uops.cpp +++ b/llvm/tools/llvm-exegesis/lib/Uops.cpp @@ -8,183 +8,220 @@ //===----------------------------------------------------------------------===// #include "Uops.h" -#include "BenchmarkResult.h" -#include "InstructionSnippetGenerator.h" + +#include "Assembler.h" +#include "BenchmarkRunner.h" +#include "MCInstrDescView.h" #include "PerfHelper.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/MC/MCInstrDesc.h" -#include "llvm/MC/MCSchedule.h" -#include "llvm/Support/Error.h" -#include <algorithm> -#include <random> -#include <unordered_map> -#include <unordered_set> + +// FIXME: Load constants into registers (e.g. with fld1) to not break +// instructions like x87. + +// Ideally we would like the only limitation on executing uops to be the issue +// ports. Maximizing port pressure increases the likelihood that the load is +// distributed evenly across possible ports. + +// To achieve that, one approach is to generate instructions that do not have +// data dependencies between them. +// +// For some instructions, this is trivial: +// mov rax, qword ptr [rsi] +// mov rax, qword ptr [rsi] +// mov rax, qword ptr [rsi] +// mov rax, qword ptr [rsi] +// For the above snippet, haswell just renames rax four times and executes the +// four instructions two at a time on P23 and P0126. +// +// For some instructions, we just need to make sure that the source is +// different from the destination. For example, IDIV8r reads from GPR and +// writes to AX. We just need to ensure that the Var is assigned a +// register which is different from AX: +// idiv bx +// idiv bx +// idiv bx +// idiv bx +// The above snippet will be able to fully saturate the ports, while the same +// with ax would issue one uop every `latency(IDIV8r)` cycles. +// +// Some instructions make this harder because they both read and write from +// the same register: +// inc rax +// inc rax +// inc rax +// inc rax +// This has a data dependency from each instruction to the next, limit the +// number of instructions that can be issued in parallel. +// It turns out that this is not a big issue on recent Intel CPUs because they +// have heuristics to balance port pressure. In the snippet above, subsequent +// instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs +// might end up executing them all on P0 (just because they can), or try +// avoiding P5 because it's usually under high pressure from vector +// instructions. +// This issue is even more important for high-latency instructions because +// they increase the idle time of the CPU, e.g. : +// imul rax, rbx +// imul rax, rbx +// imul rax, rbx +// imul rax, rbx +// +// To avoid that, we do the renaming statically by generating as many +// independent exclusive assignments as possible (until all possible registers +// are exhausted) e.g.: +// imul rax, rbx +// imul rcx, rbx +// imul rdx, rbx +// imul r8, rbx +// +// Some instruction even make the above static renaming impossible because +// they implicitly read and write from the same operand, e.g. ADC16rr reads +// and writes from EFLAGS. +// In that case we just use a greedy register assignment and hope for the +// best. namespace exegesis { -// FIXME: Handle memory (see PR36906) -static bool isInvalidOperand(const llvm::MCOperandInfo &OpInfo) { - switch (OpInfo.OperandType) { - default: - return true; - case llvm::MCOI::OPERAND_IMMEDIATE: - case llvm::MCOI::OPERAND_REGISTER: - return false; - } +static bool hasUnknownOperand(const llvm::MCOperandInfo &OpInfo) { + return OpInfo.OperandType == llvm::MCOI::OPERAND_UNKNOWN; } -static llvm::Error makeError(llvm::Twine Msg) { - return llvm::make_error<llvm::StringError>(Msg, - llvm::inconvertibleErrorCode()); +// FIXME: Handle memory, see PR36905. +static bool hasMemoryOperand(const llvm::MCOperandInfo &OpInfo) { + return OpInfo.OperandType == llvm::MCOI::OPERAND_MEMORY; } -static std::vector<llvm::MCInst> generateIndependentAssignments( - const LLVMState &State, const llvm::MCInstrDesc &InstrDesc, - llvm::SmallVector<Variable, 8> Vars, int MaxAssignments) { - std::unordered_set<llvm::MCPhysReg> IsUsedByAnyVar; - for (const Variable &Var : Vars) { - if (Var.IsUse) { - IsUsedByAnyVar.insert(Var.PossibleRegisters.begin(), - Var.PossibleRegisters.end()); - } +static bool isInfeasible(const Instruction &Instruction, std::string &Error) { + const auto &MCInstrDesc = Instruction.Description; + if (MCInstrDesc.isPseudo()) { + Error = "is pseudo"; + return true; } + if (llvm::any_of(MCInstrDesc.operands(), hasUnknownOperand)) { + Error = "has unknown operands"; + return true; + } + if (llvm::any_of(MCInstrDesc.operands(), hasMemoryOperand)) { + Error = "has memory operands"; + return true; + } + return false; +} - std::vector<llvm::MCInst> Pattern; - for (int A = 0; A < MaxAssignments; ++A) { - // FIXME: This is a bit pessimistic. We should get away with an - // assignment that ensures that the set of assigned registers for uses and - // the set of assigned registers for defs do not intersect (registers - // for uses (resp defs) do not have to be all distinct). - const std::vector<llvm::MCPhysReg> Regs = getExclusiveAssignment(Vars); - if (Regs.empty()) - break; - // Remove all assigned registers defs that are used by at least one other - // variable from the list of possible variable registers. This ensures that - // we never create a RAW hazard that would lead to serialization. - for (size_t I = 0, E = Vars.size(); I < E; ++I) { - llvm::MCPhysReg Reg = Regs[I]; - if (Vars[I].IsDef && IsUsedByAnyVar.count(Reg)) { - Vars[I].PossibleRegisters.remove(Reg); - } - } - // Create an MCInst and check assembly. - llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs); - if (!State.canAssemble(Inst)) - continue; - Pattern.push_back(std::move(Inst)); +// Returns whether this Variable ties Use and Def operands together. +static bool hasTiedOperands(const Variable *Var) { + bool HasUse = false; + bool HasDef = false; + for (const Operand *Op : Var->TiedOperands) { + if (Op->IsDef) + HasDef = true; + else + HasUse = true; } - return Pattern; + return HasUse && HasDef; +} + +static llvm::SmallVector<Variable *, 8> +getTiedVariables(const Instruction &Instruction) { + llvm::SmallVector<Variable *, 8> Result; + for (auto *Var : Instruction.Variables) + if (hasTiedOperands(Var)) + Result.push_back(Var); + return Result; +} + +static void remove(llvm::BitVector &a, const llvm::BitVector &b) { + assert(a.size() == b.size()); + for (auto I : b.set_bits()) + a.reset(I); +} + +static llvm::Error makeError(llvm::Twine Msg) { + return llvm::make_error<llvm::StringError>(Msg, + llvm::inconvertibleErrorCode()); } UopsBenchmarkRunner::~UopsBenchmarkRunner() = default; const char *UopsBenchmarkRunner::getDisplayName() const { return "uops"; } -llvm::Expected<std::vector<llvm::MCInst>> UopsBenchmarkRunner::createCode( - const LLVMState &State, const unsigned OpcodeIndex, - const unsigned NumRepetitions, const JitFunctionContext &Context) const { - const auto &InstrInfo = State.getInstrInfo(); - const auto &RegInfo = State.getRegInfo(); - const llvm::MCInstrDesc &InstrDesc = InstrInfo.get(OpcodeIndex); - for (const llvm::MCOperandInfo &OpInfo : InstrDesc.operands()) { - if (isInvalidOperand(OpInfo)) - return makeError("Only registers and immediates are supported"); +llvm::Expected<std::vector<llvm::MCInst>> +UopsBenchmarkRunner::createSnippet(RegisterAliasingTrackerCache &RATC, + unsigned Opcode, + llvm::raw_ostream &Info) const { + std::vector<llvm::MCInst> Snippet; + const llvm::MCInstrDesc &MCInstrDesc = MCInstrInfo.get(Opcode); + const Instruction Instruction(MCInstrDesc, RATC); + + std::string Error; + if (isInfeasible(Instruction, Error)) { + llvm::report_fatal_error(llvm::Twine("Infeasible : ").concat(Error)); } - // FIXME: Load constants into registers (e.g. with fld1) to not break - // instructions like x87. - - // Ideally we would like the only limitation on executing uops to be the issue - // ports. Maximizing port pressure increases the likelihood that the load is - // distributed evenly across possible ports. - - // To achieve that, one approach is to generate instructions that do not have - // data dependencies between them. - // - // For some instructions, this is trivial: - // mov rax, qword ptr [rsi] - // mov rax, qword ptr [rsi] - // mov rax, qword ptr [rsi] - // mov rax, qword ptr [rsi] - // For the above snippet, haswell just renames rax four times and executes the - // four instructions two at a time on P23 and P0126. - // - // For some instructions, we just need to make sure that the source is - // different from the destination. For example, IDIV8r reads from GPR and - // writes to AX. We just need to ensure that the variable is assigned a - // register which is different from AX: - // idiv bx - // idiv bx - // idiv bx - // idiv bx - // The above snippet will be able to fully saturate the ports, while the same - // with ax would issue one uop every `latency(IDIV8r)` cycles. - // - // Some instructions make this harder because they both read and write from - // the same register: - // inc rax - // inc rax - // inc rax - // inc rax - // This has a data dependency from each instruction to the next, limit the - // number of instructions that can be issued in parallel. - // It turns out that this is not a big issue on recent Intel CPUs because they - // have heuristics to balance port pressure. In the snippet above, subsequent - // instructions will end up evenly distributed on {P0,P1,P5,P6}, but some CPUs - // might end up executing them all on P0 (just because they can), or try - // avoiding P5 because it's usually under high pressure from vector - // instructions. - // This issue is even more important for high-latency instructions because - // they increase the idle time of the CPU, e.g. : - // imul rax, rbx - // imul rax, rbx - // imul rax, rbx - // imul rax, rbx - // - // To avoid that, we do the renaming statically by generating as many - // independent exclusive assignments as possible (until all possible registers - // are exhausted) e.g.: - // imul rax, rbx - // imul rcx, rbx - // imul rdx, rbx - // imul r8, rbx - // - // Some instruction even make the above static renaming impossible because - // they implicitly read and write from the same operand, e.g. ADC16rr reads - // and writes from EFLAGS. - // In that case we just use a greedy register assignment and hope for the - // best. - - const auto Vars = getVariables(RegInfo, InstrDesc, Context.getReservedRegs()); - - // Generate as many independent exclusive assignments as possible. - constexpr const int MaxStaticRenames = 20; - std::vector<llvm::MCInst> Pattern = - generateIndependentAssignments(State, InstrDesc, Vars, MaxStaticRenames); - if (Pattern.empty()) { - // We don't even have a single exclusive assignment, fallback to a greedy - // assignment. - // FIXME: Tell the user about this decision to help debugging. - const std::vector<llvm::MCPhysReg> Regs = getGreedyAssignment(Vars); - if (!Vars.empty() && Regs.empty()) - return makeError("No feasible greedy assignment"); - llvm::MCInst Inst = generateMCInst(InstrDesc, Vars, Regs); - if (!State.canAssemble(Inst)) - return makeError("Cannot assemble greedy assignment"); - Pattern.push_back(std::move(Inst)); + const AliasingConfigurations SelfAliasing(Instruction, Instruction); + if (SelfAliasing.empty()) { + Info << "instruction is parallel, repeating a random one.\n"; + Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction)); + return Snippet; } - - // Generate repetitions of the pattern until benchmark_iterations is reached. - std::vector<llvm::MCInst> Result; - Result.reserve(NumRepetitions); - for (unsigned I = 0; I < NumRepetitions; ++I) - Result.push_back(Pattern[I % Pattern.size()]); - return Result; + if (SelfAliasing.hasImplicitAliasing()) { + Info << "instruction is serial, repeating a random one.\n"; + Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction)); + return Snippet; + } + const auto TiedVariables = getTiedVariables(Instruction); + if (!TiedVariables.empty()) { + if (TiedVariables.size() > 1) { + Info << "Not yet implemented, don't know how to handle several tied " + "variables\n"; + return makeError("Infeasible : don't know how to handle several tied " + "variables"); + } + Info << "instruction has tied variables using static renaming.\n"; + Variable *Var = TiedVariables.front(); + assert(Var); + assert(!Var->TiedOperands.empty()); + const Operand &Operand = *Var->TiedOperands.front(); + assert(Operand.Tracker); + for (const llvm::MCPhysReg Reg : Operand.Tracker->sourceBits().set_bits()) { + clearVariableAssignments(Instruction); + Var->AssignedValue = llvm::MCOperand::createReg(Reg); + Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction)); + } + return Snippet; + } + // No tied variables, we pick random values for defs. + llvm::BitVector Defs(MCRegisterInfo.getNumRegs()); + for (const auto &Op : Instruction.Operands) { + if (Op.Tracker && Op.IsExplicit && Op.IsDef) { + assert(Op.Var); + auto PossibleRegisters = Op.Tracker->sourceBits(); + remove(PossibleRegisters, RATC.reservedRegisters()); + assert(PossibleRegisters.any() && "No register left to choose from"); + const auto RandomReg = randomBit(PossibleRegisters); + Defs.set(RandomReg); + Op.Var->AssignedValue = llvm::MCOperand::createReg(RandomReg); + } + } + // And pick random use values that are not reserved and don't alias with defs. + const auto DefAliases = getAliasedBits(MCRegisterInfo, Defs); + for (const auto &Op : Instruction.Operands) { + if (Op.Tracker && Op.IsExplicit && !Op.IsDef) { + assert(Op.Var); + auto PossibleRegisters = Op.Tracker->sourceBits(); + remove(PossibleRegisters, RATC.reservedRegisters()); + remove(PossibleRegisters, DefAliases); + assert(PossibleRegisters.any() && "No register left to choose from"); + const auto RandomReg = randomBit(PossibleRegisters); + Op.Var->AssignedValue = llvm::MCOperand::createReg(RandomReg); + } + } + Info + << "instruction has no tied variables picking Uses different from defs\n"; + Snippet.push_back(randomizeUnsetVariablesAndBuild(Instruction)); + return Snippet; } std::vector<BenchmarkMeasure> -UopsBenchmarkRunner::runMeasurements(const LLVMState &State, - const JitFunction &Function, +UopsBenchmarkRunner::runMeasurements(const ExecutableFunction &Function, const unsigned NumRepetitions) const { const auto &SchedModel = State.getSubtargetInfo().getSchedModel(); @@ -197,7 +234,11 @@ UopsBenchmarkRunner::runMeasurements(const LLVMState &State, continue; // FIXME: Sum results when there are several counters for a single ProcRes // (e.g. P23 on SandyBridge). - pfm::Counter Counter{pfm::PerfEvent(PfmCounters)}; + pfm::PerfEvent UopPerfEvent(PfmCounters); + if (!UopPerfEvent.valid()) + llvm::report_fatal_error( + llvm::Twine("invalid perf event ").concat(PfmCounters)); + pfm::Counter Counter(UopPerfEvent); Counter.start(); Function(); Counter.stop(); |