Diffstat (limited to 'llvm/lib/Transforms/Vectorize/VPlanSLP.cpp')
-rw-r--r-- | llvm/lib/Transforms/Vectorize/VPlanSLP.cpp | 469
1 file changed, 469 insertions, 0 deletions
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
new file mode 100644
index 00000000000..679fb51e48d
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,469 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+///   Look-ahead SLP: auto-vectorization in the presence of commutative
+///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+///   Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+  // FIXME: Currently this is used to signal we hit instructions we cannot
+  //        trivially SLP'ize.
+  CompletelySLP = false;
+  return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+  if (all_of(Operands, [](VPValue *V) {
+        return cast<VPInstruction>(V)->getUnderlyingInstr();
+      })) {
+    unsigned BundleSize = 0;
+    for (VPValue *V : Operands) {
+      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+      assert(!T->isVectorTy() && "Only scalar types supported for now");
+      BundleSize += T->getScalarSizeInBits();
+    }
+    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+  }
+
+  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+  assert(Res.second &&
+         "Already created a combined instruction for the operand bundle");
+  (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+  // Currently we only support VPInstructions.
+  if (!all_of(Operands, [](VPValue *Op) {
+        return Op && isa<VPInstruction>(Op) &&
+               cast<VPInstruction>(Op)->getUnderlyingInstr();
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+    return false;
+  }
+
+  // Check if opcodes and type width agree for all instructions in the bundle.
+  // FIXME: Differing widths/opcodes can be handled by inserting additional
+  //        instructions.
+  // FIXME: Deal with non-primitive types.
+  const Instruction *OriginalInstr =
+      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+  unsigned Opcode = OriginalInstr->getOpcode();
+  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+        return I->getOpcode() == Opcode &&
+               I->getType()->getPrimitiveSizeInBits() == Width;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree\n");
+    return false;
+  }
+
+  // For now, all operands must be defined in the same BB.
+  if (any_of(Operands, [this](VPValue *Op) {
+        return cast<VPInstruction>(Op)->getParent() != &this->BB;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+    return false;
+  }
+
+  if (any_of(Operands,
+             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+    return false;
+  }
+
+  // For loads, check that there are no instructions writing to memory in
+  // between them.
+  // TODO: We only have to forbid instructions writing to memory that could
+  //       interfere with any of the loads in the bundle.
+  if (Opcode == Instruction::Load) {
+    unsigned LoadsSeen = 0;
+    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+    for (auto &I : *Parent) {
+      auto *VPI = cast<VPInstruction>(&I);
+      if (VPI->getOpcode() == Instruction::Load &&
+          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+        LoadsSeen++;
+
+      if (LoadsSeen == Operands.size())
+        break;
+      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+        LLVM_DEBUG(
+            dbgs() << "VPSLP: instruction modifying memory between loads\n");
+        return false;
+      }
+    }
+
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+      return false;
+    }
+  }
+
+  if (Opcode == Instruction::Store)
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+      return false;
+    }
+
+  return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+                                             unsigned OperandIndex) {
+  SmallVector<VPValue *, 4> Operands;
+  for (VPValue *V : Values) {
+    auto *U = cast<VPUser>(V);
+    Operands.push_back(U->getOperand(OperandIndex));
+  }
+  return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+  return Instruction::isCommutative(
+      cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+  auto *VPI = cast<VPInstruction>(Values[0]);
+
+  switch (VPI->getOpcode()) {
+  case Instruction::Load:
+    llvm_unreachable("Loads terminate a tree, no need to get operands");
+  case Instruction::Store:
+    Result.push_back(getOperands(Values, 0));
+    break;
+  default:
+    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+      Result.push_back(getOperands(Values, I));
+    break;
+  }
+
+  return Result;
+}
+
+/// Returns the opcode of Values or None if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+  if (any_of(Values, [Opcode](VPValue *V) {
+        return cast<VPInstruction>(V)->getOpcode() != Opcode;
+      }))
+    return None;
+  return {Opcode};
+}
+
+/// Returns true if A and B are loads or stores accessing consecutive memory,
+/// or if they have identical opcodes otherwise.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+                                  VPInterleavedAccessInfo &IAI) {
+  if (A->getOpcode() != B->getOpcode())
+    return false;
+
+  if (A->getOpcode() != Instruction::Load &&
+      A->getOpcode() != Instruction::Store)
+    return true;
+  auto *GA = IAI.getInterleaveGroup(A);
+  auto *GB = IAI.getInterleaveGroup(B);
+
+  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+                           VPInterleavedAccessInfo &IAI) {
+  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+    return 0;
+
+  if (MaxLevel == 0)
+    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+                                           cast<VPInstruction>(V2), IAI);
+
+  unsigned Score = 0;
+  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+  return Score;
+}
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+                  SmallVectorImpl<VPValue *> &Candidates,
+                  VPInterleavedAccessInfo &IAI) {
+  LLVM_DEBUG(dbgs() << " getBest\n");
+  VPValue *Best = Candidates[0];
+  SmallVector<VPValue *, 4> BestCandidates;
+
+  LLVM_DEBUG(dbgs() << " Candidates for "
+                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+  for (auto *Candidate : Candidates) {
+    auto *LastI = cast<VPInstruction>(Last);
+    auto *CandidateI = cast<VPInstruction>(Candidate);
+    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+                        << " ");
+      BestCandidates.push_back(Candidate);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BestCandidates.empty())
+    return {OpMode::Failed, nullptr};
+
+  if (BestCandidates.size() == 1)
+    return {Mode, BestCandidates[0]};
+
+  if (Mode == OpMode::Opcode) {
+    unsigned BestScore = 0;
+    for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+      unsigned PrevScore = ~0u;
+      bool AllSame = true;
+
+      // FIXME: Avoid visiting the same operands multiple times.
+      for (auto *Candidate : BestCandidates) {
+        unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+        if (PrevScore == ~0u)
+          PrevScore = Score;
+        if (PrevScore != Score)
+          AllSame = false;
+        PrevScore = Score;
+
+        if (Score > BestScore) {
+          BestScore = Score;
+          Best = Candidate;
+        }
+      }
+      if (!AllSame)
+        break;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Found best "
+                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+                    << "\n");
+  Candidates.erase(std::remove(Candidates.begin(), Candidates.end(), Best),
+                   Candidates.end());
+
+  return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+  SmallVector<MultiNodeOpTy, 4> FinalOrder;
+  SmallVector<OpMode, 4> Mode;
+  FinalOrder.reserve(MultiNodeOps.size());
+  Mode.reserve(MultiNodeOps.size());
+
+  LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+  for (auto &Operands : MultiNodeOps) {
+    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+        Instruction::Load)
+      Mode.push_back(OpMode::Load);
+    else
+      Mode.push_back(OpMode::Opcode);
+  }
+
+  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+    LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
+    SmallVector<VPValue *, 4> Candidates;
+    Candidates.reserve(MultiNodeOps.size());
+    LLVM_DEBUG(dbgs() << " Candidates ");
+    for (auto Ops : MultiNodeOps) {
+      LLVM_DEBUG(
+          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+                 << " ");
+      Candidates.push_back(Ops.second[Lane]);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+      LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
+      if (Mode[Op] == OpMode::Failed)
+        continue;
+
+      VPValue *Last = FinalOrder[Op].second[Lane - 1];
+      std::pair<OpMode, VPValue *> Res =
+          getBest(Mode[Op], Last, Candidates, IAI);
+      if (Res.second)
+        FinalOrder[Op].second.push_back(Res.second);
+      else
+        // TODO: handle this case
+        FinalOrder[Op].second.push_back(markFailed());
+    }
+  }
+
+  return FinalOrder;
+}
+
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+  LLVM_DEBUG(dbgs() << " Ops: ");
+  for (auto Op : Values)
+    if (auto *Instr = cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
+      LLVM_DEBUG(dbgs() << *Instr << " | ");
+    else
+      LLVM_DEBUG(dbgs() << " nullptr | ");
+  LLVM_DEBUG(dbgs() << "\n");
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+  assert(!Values.empty() && "Need some operands!");
+
+  // If we already visited this instruction bundle, re-use the existing node.
+  auto I = BundleToCombined.find(to_vector<4>(Values));
+  if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+    // Check that the resulting graph is a tree. If we re-use a node, this
+    // means its values have multiple users. We only allow this if all users
+    // of each value are the same instruction.
+    for (auto *V : Values) {
+      auto UI = V->user_begin();
+      auto *FirstUser = *UI++;
+      while (UI != V->user_end()) {
+        assert(*UI == FirstUser && "Currently we only support SLP trees.");
+        UI++;
+      }
+    }
+#endif
+    return I->second;
+  }
+
+  // Dump inputs.
+  LLVM_DEBUG({
+    dbgs() << "buildGraph: ";
+    dumpBundle(Values);
+  });
+
+  if (!areVectorizable(Values))
+    return markFailed();
+
+  assert(getOpcode(Values) && "Opcodes for all values must match");
+  unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+  SmallVector<VPValue *, 4> CombinedOperands;
+  if (areCommutative(Values)) {
+    bool MultiNodeRoot = !MultiNodeActive;
+    MultiNodeActive = true;
+    for (auto &Operands : getOperands(Values)) {
+      LLVM_DEBUG({
+        dbgs() << " Visiting Commutative";
+        dumpBundle(Operands);
+      });
+
+      auto OperandsOpcode = getOpcode(Operands);
+      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+        LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
+        CombinedOperands.push_back(buildGraph(Operands));
+      } else {
+        LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
+        // Create a dummy VPInstruction, which we will replace later with the
+        // re-ordered operand.
+        VPInstruction *Op = new VPInstruction(0, {});
+        CombinedOperands.push_back(Op);
+        MultiNodeOps.emplace_back(Op, Operands);
+      }
+    }
+
+    if (MultiNodeRoot) {
+      LLVM_DEBUG(dbgs() << "Reorder\n");
+      MultiNodeActive = false;
+
+      auto FinalOrder = reorderMultiNodeOps();
+
+      MultiNodeOps.clear();
+      for (auto &Ops : FinalOrder) {
+        VPInstruction *NewOp = buildGraph(Ops.second);
+        Ops.first->replaceAllUsesWith(NewOp);
+        for (unsigned i = 0; i < CombinedOperands.size(); i++)
+          if (CombinedOperands[i] == Ops.first)
+            CombinedOperands[i] = NewOp;
+        delete Ops.first;
+        Ops.first = NewOp;
+      }
+      LLVM_DEBUG(dbgs() << "Found final order\n");
+    }
+  } else {
+    LLVM_DEBUG(dbgs() << " NonCommutative\n");
+    if (ValuesOpcode == Instruction::Load)
+      for (VPValue *V : Values)
+        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+    else
+      for (auto &Operands : getOperands(Values))
+        CombinedOperands.push_back(buildGraph(Operands));
+  }
+
+  unsigned Opcode;
+  switch (ValuesOpcode) {
+  case Instruction::Load:
+    Opcode = VPInstruction::SLPLoad;
+    break;
+  case Instruction::Store:
+    Opcode = VPInstruction::SLPStore;
+    break;
+  default:
+    Opcode = ValuesOpcode;
+    break;
+  }
+
+  if (!CompletelySLP)
+    return markFailed();
+
+  assert(CombinedOperands.size() > 0 && "Need some operands");
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+  addCombined(Values, VPI);
+  return VPI;
+}