diff options
| -rw-r--r-- | llvm/lib/Target/BPF/BPF.h | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp | 169 | ||||
| -rw-r--r-- | llvm/lib/Target/BPF/BPFMIPeephole.cpp | 189 | ||||
| -rw-r--r-- | llvm/lib/Target/BPF/BPFTargetMachine.cpp | 12 | ||||
| -rw-r--r-- | llvm/test/CodeGen/BPF/remove_truncate_6.ll | 80 |
5 files changed, 285 insertions, 167 deletions
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h index ba21503d868..6e4f35f4c5d 100644 --- a/llvm/lib/Target/BPF/BPF.h +++ b/llvm/lib/Target/BPF/BPF.h @@ -20,12 +20,14 @@ ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM); FunctionPass *createBPFISelDag(BPFTargetMachine &TM); FunctionPass *createBPFMISimplifyPatchablePass(); FunctionPass *createBPFMIPeepholePass(); +FunctionPass *createBPFMIPeepholeTruncElimPass(); FunctionPass *createBPFMIPreEmitPeepholePass(); FunctionPass *createBPFMIPreEmitCheckingPass(); void initializeBPFAbstractMemberAccessPass(PassRegistry&); void initializeBPFMISimplifyPatchablePass(PassRegistry&); void initializeBPFMIPeepholePass(PassRegistry&); +void initializeBPFMIPeepholeTruncElimPass(PassRegistry&); void initializeBPFMIPreEmitPeepholePass(PassRegistry&); void initializeBPFMIPreEmitCheckingPass(PassRegistry&); } diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp index 85fa1f2a6be..f2be0ff070d 100644 --- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp +++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp @@ -45,9 +45,7 @@ class BPFDAGToDAGISel : public SelectionDAGISel { public: explicit BPFDAGToDAGISel(BPFTargetMachine &TM) - : SelectionDAGISel(TM), Subtarget(nullptr) { - curr_func_ = nullptr; - } + : SelectionDAGISel(TM), Subtarget(nullptr) {} StringRef getPassName() const override { return "BPF DAG->DAG Pattern Instruction Selection"; @@ -92,14 +90,8 @@ private: val_vec_type &Vals, int Offset); bool getConstantFieldValue(const GlobalAddressSDNode *Node, uint64_t Offset, uint64_t Size, unsigned char *ByteSeq); - bool checkLoadDef(unsigned DefReg, unsigned match_load_op); - // Mapping from ConstantStruct global value to corresponding byte-list values std::map<const void *, val_vec_type> cs_vals_; - // Mapping from vreg to load memory opcode - std::map<unsigned, unsigned> load_to_vreg_; - // Current function - const Function *curr_func_; }; } // namespace @@ -325,32 +317,13 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node, } void BPFDAGToDAGISel::PreprocessISelDAG() { - // Iterate through all nodes, interested in the following cases: + // Iterate through all nodes, interested in the following case: // // . loads from ConstantStruct or ConstantArray of constructs // which can be turns into constant itself, with this we can // avoid reading from read-only section at runtime. // - // . reg truncating is often the result of 8/16/32bit->64bit or - // 8/16bit->32bit conversion. If the reg value is loaded with - // masked byte width, the AND operation can be removed since - // BPF LOAD already has zero extension. - // - // This also solved a correctness issue. - // In BPF socket-related program, e.g., __sk_buff->{data, data_end} - // are 32-bit registers, but later on, kernel verifier will rewrite - // it with 64-bit value. Therefore, truncating the value after the - // load will result in incorrect code. - - // clear the load_to_vreg_ map so that we have a clean start - // for this function. - if (!curr_func_) { - curr_func_ = FuncInfo->Fn; - } else if (curr_func_ != FuncInfo->Fn) { - load_to_vreg_.clear(); - curr_func_ = FuncInfo->Fn; - } - + // . Removing redundant AND for intrinsic narrow loads. for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E;) { @@ -358,8 +331,6 @@ void BPFDAGToDAGISel::PreprocessISelDAG() { unsigned Opcode = Node->getOpcode(); if (Opcode == ISD::LOAD) PreprocessLoad(Node, I); - else if (Opcode == ISD::CopyToReg) - PreprocessCopyToReg(Node); else if (Opcode == ISD::AND) PreprocessTrunc(Node, I); } @@ -491,36 +462,6 @@ bool BPFDAGToDAGISel::fillConstantStruct(const DataLayout &DL, return true; } -void BPFDAGToDAGISel::PreprocessCopyToReg(SDNode *Node) { - const RegisterSDNode *RegN = dyn_cast<RegisterSDNode>(Node->getOperand(1)); - if (!RegN || !Register::isVirtualRegister(RegN->getReg())) - return; - - const LoadSDNode *LD = dyn_cast<LoadSDNode>(Node->getOperand(2)); - if (!LD) - return; - - // Assign a load value to a virtual register. record its load width - unsigned mem_load_op = 0; - switch (LD->getMemOperand()->getSize()) { - default: - return; - case 4: - mem_load_op = BPF::LDW; - break; - case 2: - mem_load_op = BPF::LDH; - break; - case 1: - mem_load_op = BPF::LDB; - break; - } - - LLVM_DEBUG(dbgs() << "Find Load Value to VReg " - << Register::virtReg2Index(RegN->getReg()) << '\n'); - load_to_vreg_[RegN->getReg()] = mem_load_op; -} - void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, SelectionDAG::allnodes_iterator &I) { ConstantSDNode *MaskN = dyn_cast<ConstantSDNode>(Node->getOperand(1)); @@ -534,112 +475,26 @@ void BPFDAGToDAGISel::PreprocessTrunc(SDNode *Node, // which the generic optimizer doesn't understand their results are // zero extended. SDValue BaseV = Node->getOperand(0); - if (BaseV.getOpcode() == ISD::INTRINSIC_W_CHAIN) { - unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue(); - uint64_t MaskV = MaskN->getZExtValue(); - - if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || - (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) || - (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF))) - return; - - LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; - Node->dump(); dbgs() << '\n'); - - I--; - CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); - I++; - CurDAG->DeleteNode(Node); - - return; - } - - // Multiple basic blocks case. - if (BaseV.getOpcode() != ISD::CopyFromReg) + if (BaseV.getOpcode() != ISD::INTRINSIC_W_CHAIN) return; - unsigned match_load_op = 0; - switch (MaskN->getZExtValue()) { - default: - return; - case 0xFFFFFFFF: - match_load_op = BPF::LDW; - break; - case 0xFFFF: - match_load_op = BPF::LDH; - break; - case 0xFF: - match_load_op = BPF::LDB; - break; - } + unsigned IntNo = cast<ConstantSDNode>(BaseV->getOperand(1))->getZExtValue(); + uint64_t MaskV = MaskN->getZExtValue(); - const RegisterSDNode *RegN = - dyn_cast<RegisterSDNode>(BaseV.getNode()->getOperand(1)); - if (!RegN || !Register::isVirtualRegister(RegN->getReg())) + if (!((IntNo == Intrinsic::bpf_load_byte && MaskV == 0xFF) || + (IntNo == Intrinsic::bpf_load_half && MaskV == 0xFFFF) || + (IntNo == Intrinsic::bpf_load_word && MaskV == 0xFFFFFFFF))) return; - unsigned AndOpReg = RegN->getReg(); - LLVM_DEBUG(dbgs() << "Examine " << printReg(AndOpReg) << '\n'); - - // Examine the PHI insns in the MachineBasicBlock to found out the - // definitions of this virtual register. At this stage (DAG2DAG - // transformation), only PHI machine insns are available in the machine basic - // block. - MachineBasicBlock *MBB = FuncInfo->MBB; - MachineInstr *MII = nullptr; - for (auto &MI : *MBB) { - for (unsigned i = 0; i < MI.getNumOperands(); ++i) { - const MachineOperand &MOP = MI.getOperand(i); - if (!MOP.isReg() || !MOP.isDef()) - continue; - Register Reg = MOP.getReg(); - if (Register::isVirtualRegister(Reg) && Reg == AndOpReg) { - MII = &MI; - break; - } - } - } - - if (MII == nullptr) { - // No phi definition in this block. - if (!checkLoadDef(AndOpReg, match_load_op)) - return; - } else { - // The PHI node looks like: - // %2 = PHI %0, <%bb.1>, %1, <%bb.3> - // Trace each incoming definition, e.g., (%0, %bb.1) and (%1, %bb.3) - // The AND operation can be removed if both %0 in %bb.1 and %1 in - // %bb.3 are defined with a load matching the MaskN. - LLVM_DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n'); - unsigned PrevReg = -1; - for (unsigned i = 0; i < MII->getNumOperands(); ++i) { - const MachineOperand &MOP = MII->getOperand(i); - if (MOP.isReg()) { - if (MOP.isDef()) - continue; - PrevReg = MOP.getReg(); - if (!Register::isVirtualRegister(PrevReg)) - return; - if (!checkLoadDef(PrevReg, match_load_op)) - return; - } - } - } - LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; Node->dump(); - dbgs() << '\n'); + LLVM_DEBUG(dbgs() << "Remove the redundant AND operation in: "; + Node->dump(); dbgs() << '\n'); I--; CurDAG->ReplaceAllUsesWith(SDValue(Node, 0), BaseV); I++; CurDAG->DeleteNode(Node); -} - -bool BPFDAGToDAGISel::checkLoadDef(unsigned DefReg, unsigned match_load_op) { - auto it = load_to_vreg_.find(DefReg); - if (it == load_to_vreg_.end()) - return false; // The definition of register is not exported yet. - return it->second == match_load_op; + return; } FunctionPass *llvm::createBPFISelDag(BPFTargetMachine &TM) { diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp index fafd2f703ad..72fd131c459 100644 --- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp +++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp @@ -71,7 +71,7 @@ void BPFMIPeephole::initialize(MachineFunction &MFParm) { MF = &MFParm; MRI = &MF->getRegInfo(); TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo(); - LLVM_DEBUG(dbgs() << "*** BPF MachineSSA peephole pass ***\n\n"); + LLVM_DEBUG(dbgs() << "*** BPF MachineSSA ZEXT Elim peephole pass ***\n\n"); } bool BPFMIPeephole::isMovFrom32Def(MachineInstr *MovMI) @@ -186,7 +186,8 @@ bool BPFMIPeephole::eliminateZExtSeq(void) { } // end default namespace INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE, - "BPF MachineSSA Peephole Optimization", false, false) + "BPF MachineSSA Peephole Optimization For ZEXT Eliminate", + false, false) char BPFMIPeephole::ID = 0; FunctionPass* llvm::createBPFMIPeepholePass() { return new BPFMIPeephole(); } @@ -253,12 +254,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) { // enabled. The special type cast insn MOV_32_64 involves different // register class on src (i32) and dst (i64), RA could generate useless // instruction due to this. - if (MI.getOpcode() == BPF::MOV_32_64) { + unsigned Opcode = MI.getOpcode(); + if (Opcode == BPF::MOV_32_64 || + Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) { Register dst = MI.getOperand(0).getReg(); - Register dst_sub = TRI->getSubReg(dst, BPF::sub_32); Register src = MI.getOperand(1).getReg(); - if (dst_sub != src) + if (Opcode == BPF::MOV_32_64) + dst = TRI->getSubReg(dst, BPF::sub_32); + + if (dst != src) continue; ToErase = &MI; @@ -281,3 +286,177 @@ FunctionPass* llvm::createBPFMIPreEmitPeepholePass() { return new BPFMIPreEmitPeephole(); } + +STATISTIC(TruncElemNum, "Number of truncation eliminated"); + +namespace { + +struct BPFMIPeepholeTruncElim : public MachineFunctionPass { + + static char ID; + const BPFInstrInfo *TII; + MachineFunction *MF; + MachineRegisterInfo *MRI; + + BPFMIPeepholeTruncElim() : MachineFunctionPass(ID) { + initializeBPFMIPeepholeTruncElimPass(*PassRegistry::getPassRegistry()); + } + +private: + // Initialize class variables. + void initialize(MachineFunction &MFParm); + + bool eliminateTruncSeq(void); + +public: + + // Main entry point for this pass. + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + + initialize(MF); + + return eliminateTruncSeq(); + } +}; + +static bool TruncSizeCompatible(int TruncSize, unsigned opcode) +{ + if (TruncSize == 1) + return opcode == BPF::LDB || opcode == BPF::LDB32; + + if (TruncSize == 2) + return opcode == BPF::LDH || opcode == BPF::LDH32; + + if (TruncSize == 4) + return opcode == BPF::LDW || opcode == BPF::LDW32; + + return false; +} + +// Initialize class variables. +void BPFMIPeepholeTruncElim::initialize(MachineFunction &MFParm) { + MF = &MFParm; + MRI = &MF->getRegInfo(); + TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo(); + LLVM_DEBUG(dbgs() << "*** BPF MachineSSA TRUNC Elim peephole pass ***\n\n"); +} + +// Reg truncating is often the result of 8/16/32bit->64bit or +// 8/16bit->32bit conversion. If the reg value is loaded with +// masked byte width, the AND operation can be removed since +// BPF LOAD already has zero extension. +// +// This also solved a correctness issue. +// In BPF socket-related program, e.g., __sk_buff->{data, data_end} +// are 32-bit registers, but later on, kernel verifier will rewrite +// it with 64-bit value. Therefore, truncating the value after the +// load will result in incorrect code. +bool BPFMIPeepholeTruncElim::eliminateTruncSeq(void) { + MachineInstr* ToErase = nullptr; + bool Eliminated = false; + + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB) { + // The second insn to remove if the eliminate candidate is a pair. + MachineInstr *MI2 = nullptr; + Register DstReg, SrcReg; + MachineInstr *DefMI; + int TruncSize = -1; + + // If the previous instruction was marked for elimination, remove it now. + if (ToErase) { + ToErase->eraseFromParent(); + ToErase = nullptr; + } + + // AND A, 0xFFFFFFFF will be turned into SLL/SRL pair due to immediate + // for BPF ANDI is i32, and this case only happens on ALU64. + if (MI.getOpcode() == BPF::SRL_ri && + MI.getOperand(2).getImm() == 32) { + SrcReg = MI.getOperand(1).getReg(); + MI2 = MRI->getVRegDef(SrcReg); + DstReg = MI.getOperand(0).getReg(); + + if (!MI2 || + MI2->getOpcode() != BPF::SLL_ri || + MI2->getOperand(2).getImm() != 32) + continue; + + // Update SrcReg. + SrcReg = MI2->getOperand(1).getReg(); + DefMI = MRI->getVRegDef(SrcReg); + if (DefMI) + TruncSize = 4; + } else if (MI.getOpcode() == BPF::AND_ri || + MI.getOpcode() == BPF::AND_ri_32) { + SrcReg = MI.getOperand(1).getReg(); + DstReg = MI.getOperand(0).getReg(); + DefMI = MRI->getVRegDef(SrcReg); + + if (!DefMI) + continue; + + int64_t imm = MI.getOperand(2).getImm(); + if (imm == 0xff) + TruncSize = 1; + else if (imm == 0xffff) + TruncSize = 2; + } + + if (TruncSize == -1) + continue; + + // The definition is PHI node, check all inputs. + if (DefMI->isPHI()) { + bool CheckFail = false; + + for (unsigned i = 1, e = DefMI->getNumOperands(); i < e; i += 2) { + MachineOperand &opnd = DefMI->getOperand(i); + if (!opnd.isReg()) { + CheckFail = true; + break; + } + + MachineInstr *PhiDef = MRI->getVRegDef(opnd.getReg()); + if (!PhiDef || PhiDef->isPHI() || + !TruncSizeCompatible(TruncSize, PhiDef->getOpcode())) { + CheckFail = true; + break; + } + } + + if (CheckFail) + continue; + } else if (!TruncSizeCompatible(TruncSize, DefMI->getOpcode())) { + continue; + } + + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::MOV_rr), DstReg) + .addReg(SrcReg); + + if (MI2) + MI2->eraseFromParent(); + + // Mark it to ToErase, and erase in the next iteration. + ToErase = &MI; + TruncElemNum++; + Eliminated = true; + } + } + + return Eliminated; +} + +} // end default namespace + +INITIALIZE_PASS(BPFMIPeepholeTruncElim, "bpf-mi-trunc-elim", + "BPF MachineSSA Peephole Optimization For TRUNC Eliminate", + false, false) + +char BPFMIPeepholeTruncElim::ID = 0; +FunctionPass* llvm::createBPFMIPeepholeTruncElimPass() +{ + return new BPFMIPeepholeTruncElim(); +} diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp index d940ac965c0..0c4f2c74e7a 100644 --- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp +++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp @@ -36,6 +36,7 @@ extern "C" void LLVMInitializeBPFTarget() { PassRegistry &PR = *PassRegistry::getPassRegistry(); initializeBPFAbstractMemberAccessPass(PR); initializeBPFMIPeepholePass(PR); + initializeBPFMIPeepholeTruncElimPass(PR); } // DataLayout: little or big endian @@ -115,15 +116,16 @@ void BPFPassConfig::addMachineSSAOptimization() { TargetPassConfig::addMachineSSAOptimization(); const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl(); - if (Subtarget->getHasAlu32() && !DisableMIPeephole) - addPass(createBPFMIPeepholePass()); + if (!DisableMIPeephole) { + if (Subtarget->getHasAlu32()) + addPass(createBPFMIPeepholePass()); + addPass(createBPFMIPeepholeTruncElimPass()); + } } void BPFPassConfig::addPreEmitPass() { - const BPFSubtarget *Subtarget = getBPFTargetMachine().getSubtargetImpl(); - addPass(createBPFMIPreEmitCheckingPass()); if (getOptLevel() != CodeGenOpt::None) - if (Subtarget->getHasAlu32() && !DisableMIPeephole) + if (!DisableMIPeephole) addPass(createBPFMIPreEmitPeepholePass()); } diff --git a/llvm/test/CodeGen/BPF/remove_truncate_6.ll b/llvm/test/CodeGen/BPF/remove_truncate_6.ll new file mode 100644 index 00000000000..6577afb13ba --- /dev/null +++ b/llvm/test/CodeGen/BPF/remove_truncate_6.ll @@ -0,0 +1,80 @@ +; RUN: llc < %s -march=bpf -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=bpf -mattr=+alu32 -verify-machineinstrs | FileCheck --check-prefix=CHECK-32 %s +; +; void cal1(unsigned short *a, unsigned long *b, unsigned int k) +; { +; unsigned short e; +; +; e = *a; +; for (unsigned int i = 0; i < k; i++) { +; b[i] = e; +; e = ~e; +; } +; } +; +; void cal2(unsigned short *a, unsigned int *b, unsigned int k) +; { +; unsigned short e; +; +; e = *a; +; for (unsigned int i = 0; i < k; i++) { +; b[i] = e; +; e = ~e; +; } +; } + +; Function Attrs: nofree norecurse nounwind optsize +define dso_local void @cal1(i16* nocapture readonly %a, i64* nocapture %b, i32 %k) local_unnamed_addr #0 { +entry: + %cmp8 = icmp eq i32 %k, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = load i16, i16* %a, align 2 + %wide.trip.count = zext i32 %k to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ] + %conv = zext i16 %e.09 to i64 + %arrayidx = getelementptr inbounds i64, i64* %b, i64 %indvars.iv +; CHECK: r{{[0-9]+}} &= 65535 +; CHECK-32: r{{[0-9]+}} &= 65535 + store i64 %conv, i64* %arrayidx, align 8 + %neg = xor i16 %e.09, -1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +; Function Attrs: nofree norecurse nounwind optsize +define dso_local void @cal2(i16* nocapture readonly %a, i32* nocapture %b, i32 %k) local_unnamed_addr #0 { +entry: + %cmp8 = icmp eq i32 %k, 0 + br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %0 = load i16, i16* %a, align 2 + %wide.trip.count = zext i32 %k to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %entry + ret void + +for.body: ; preds = %for.body, %for.body.preheader + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %e.09 = phi i16 [ %0, %for.body.preheader ], [ %neg, %for.body ] + %conv = zext i16 %e.09 to i32 + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv +; CHECK: r{{[0-9]+}} &= 65535 +; CHECK-32: w{{[0-9]+}} &= 65535 + store i32 %conv, i32* %arrayidx, align 4 + %neg = xor i16 %e.09, -1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} |

