| author | Amara Emerson <aemerson@apple.com> | 2019-07-24 22:17:31 +0000 |
|---|---|---|
| committer | Amara Emerson <aemerson@apple.com> | 2019-07-24 22:17:31 +0000 |
| commit | 13af1ed8e37c3f53b4dbd03f7c40b069d4895087 (patch) | |
| tree | 2b005088e9b02aa4743ac52ce5b94cf34a99c6ab /llvm/lib | |
| parent | eb6782758a54f200a1a5e531bef28192833d2584 (diff) | |
[GlobalISel] Support for inlining memcpy, memset and memmove calls.
This introduces a new family of combiner helper routines that reuse the
target-specific cost model from SelectionDAG and generate inline
implementations of the memcpy family of intrinsics.
The combines are only enabled at optimization levels higher than -O0, and
they give very substantial performance improvements.
Differential Revision: https://reviews.llvm.org/D65167
llvm-svn: 366951
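For context, the new `CombinerHelper::tryCombineMemCpyFamily()` entry point is wired into the AArch64 pre-legalizer combiner in the `AArch64PreLegalizerCombiner.cpp` hunk further down. The following is a minimal sketch of how another GlobalISel target could opt in, assuming a hypothetical "MyTarget" backend and the `CombinerInfo`/`CombinerHelper` interfaces as they exist at this revision; it is illustrative, not part of the commit.

```cpp
// Sketch for a hypothetical "MyTarget" backend; the AArch64 hunk below is the
// version this commit actually adds.
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

namespace {
class MyTargetPreLegalizerCombinerInfo : public CombinerInfo {
public:
  MyTargetPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize) {}
  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};

bool MyTargetPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                               MachineInstr &MI,
                                               MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B);
  switch (MI.getOpcode()) {
  default:
    break;
  case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
    switch (MI.getIntrinsicID()) {
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
    case Intrinsic::memset:
      // Only inline when optimizing and not optimizing for size, the same
      // gating the AArch64 combiner uses in this patch.
      return (EnableOpt && !EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI)
                                           : false;
    default:
      break;
    }
    break;
  }
  return false;
}
} // end anonymous namespace
```

`tryCombineMemCpyFamily()` then checks for a constant, non-volatile length and dispatches to `optimizeMemcpy`, `optimizeMemmove`, or `optimizeMemset`, falling back to the legalizer's libcall path otherwise.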
Diffstat (limited to 'llvm/lib')
| File | Lines changed |
|---|---|
| llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp | 505 |
| llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 59 |
| llvm/lib/Target/AArch64/AArch64ISelLowering.h | 8 |
| llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp | 22 |
| llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp | 3 |

5 files changed, 593 insertions, 4 deletions
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 9cbf3dd83ff..1ae454f4bd4 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -10,9 +10,12 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "gi-combiner" @@ -404,6 +407,508 @@ bool CombinerHelper::tryCombineBr(MachineInstr &MI) { return true; } +static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { + // On Darwin, -Os means optimize for size without hurting performance, so + // only really optimize for size when -Oz (MinSize) is used. + if (MF.getTarget().getTargetTriple().isOSDarwin()) + return MF.getFunction().hasMinSize(); + return MF.getFunction().hasOptSize(); +} + +// Get a rough equivalent of an MVT for a given LLT. +static MVT getMVTForLLT(LLT Ty) { + if (!Ty.isVector()) + return MVT::getIntegerVT(Ty.getSizeInBits()); + + return MVT::getVectorVT( + MVT::getIntegerVT(Ty.getElementType().getSizeInBits()), + Ty.getNumElements()); +} + +// Returns a list of types to use for memory op lowering in MemOps. A partial +// port of findOptimalMemOpLowering in TargetLowering. +static bool findGISelOptimalMemOpLowering( + std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + bool AllowOverlap, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes, const TargetLowering &TLI) { + // If 'SrcAlign' is zero, that means the memory operation does not need to + // load the value, i.e. memset or memcpy from constant string. Otherwise, + // it's the inferred alignment of the source. 'DstAlign', on the other hand, + // is the specified alignment of the memory operation. If it is zero, that + // means it's possible to change the alignment of the destination. + // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does + // not need to be loaded. + if (SrcAlign != 0 && SrcAlign < DstAlign) + return false; + + LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset, + ZeroMemset, MemcpyStrSrc, FuncAttributes); + + if (Ty == LLT()) { + // Use the largest scalar type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + Ty = LLT::scalar(64); + while (DstAlign && DstAlign < Ty.getSizeInBytes() && + !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign)) + Ty = LLT::scalar(Ty.getSizeInBytes()); + assert(Ty.getSizeInBits() > 0 && "Could not find valid type"); + // FIXME: check for the largest legal type we can load/store to. + } + + unsigned NumMemOps = 0; + while (Size != 0) { + unsigned TySize = Ty.getSizeInBytes(); + while (TySize > Size) { + // For now, only use non-vector load / store's for the left-over pieces. + LLT NewTy = Ty; + // FIXME: check for mem op safety and legality of the types. Not all of + // SDAGisms map cleanly to GISel concepts. + if (NewTy.isVector()) + NewTy = NewTy.getSizeInBits() > 64 ? 
LLT::scalar(64) : LLT::scalar(32); + unsigned NewTySize = NewTy.getSizeInBytes(); + + NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits()-1)); + NewTySize = NewTy.getSizeInBytes(); + assert(NewTySize > 0 && "Could not find appropriate type"); + + // If the new LLT cannot cover all of the remaining bits, then consider + // issuing a (or a pair of) unaligned and overlapping load / store. + bool Fast; + // Need to get a VT equivalent for allowMisalignedMemoryAccesses(). + MVT VT = getMVTForLLT(Ty); + if (NumMemOps && AllowOverlap && NewTySize < Size && + TLI.allowsMisalignedMemoryAccesses( + VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) && + Fast) + TySize = Size; + else { + Ty = NewTy; + TySize = NewTySize; + } + } + + if (++NumMemOps > Limit) + return false; + + MemOps.push_back(Ty); + Size -= TySize; + } + + return true; +} + +static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { + if (Ty.isVector()) + return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()), + Ty.getNumElements()); + return IntegerType::get(C, Ty.getSizeInBits()); +} + +// Get a vectorized representation of the memset value operand, GISel edition. +static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) { + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NumBits = Ty.getScalarSizeInBits(); + auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); + if (!Ty.isVector() && ValVRegAndVal) { + unsigned KnownVal = ValVRegAndVal->Value; + APInt Scalar = APInt(8, KnownVal); + APInt SplatVal = APInt::getSplat(NumBits, Scalar); + return MIB.buildConstant(Ty, SplatVal).getReg(0); + } + // FIXME: for vector types create a G_BUILD_VECTOR. + if (Ty.isVector()) + return Register(); + + // Extend the byte value to the larger type, and then multiply by a magic + // value 0x010101... in order to replicate it across every byte. + LLT ExtType = Ty.getScalarType(); + auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val); + if (NumBits > 8) { + APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); + auto MagicMI = MIB.buildConstant(ExtType, Magic); + Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0); + } + + assert(ExtType == Ty && "Vector memset value type not supported yet"); + return Val; +} + +bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val, + unsigned KnownLen, unsigned Align, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + assert(KnownLen != 0 && "Have a zero length memset length!"); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + unsigned Limit = TLI.getMaxStoresPerMemset(OptSize); + std::vector<LLT> MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + + auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); + bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0; + + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 
0 : Align), 0, + /*IsMemset=*/true, + /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u, + MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + if (NewAlign > Align) { + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < NewAlign) + MFI.setObjectAlignment(FI, NewAlign); + Align = NewAlign; + } + } + + MachineIRBuilder MIB(MI); + // Find the largest store and generate the bit pattern for it. + LLT LargestTy = MemOps[0]; + for (unsigned i = 1; i < MemOps.size(); i++) + if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits()) + LargestTy = MemOps[i]; + + // The memset stored value is always defined as an s8, so in order to make it + // work with larger store types we need to repeat the bit pattern across the + // wider type. + Register MemSetValue = getMemsetValue(Val, LargestTy, MIB); + + if (!MemSetValue) + return false; + + // Generate the stores. For each store type in the list, we generate the + // matching store of that type to the destination address. + LLT PtrTy = MRI.getType(Dst); + unsigned DstOff = 0; + unsigned Size = KnownLen; + for (unsigned I = 0; I < MemOps.size(); I++) { + LLT Ty = MemOps[I]; + unsigned TySize = Ty.getSizeInBytes(); + if (TySize > Size) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + assert(I == MemOps.size() - 1 && I != 0); + DstOff -= TySize - Size; + } + + // If this store is smaller than the largest store see whether we can get + // the smaller value for free with a truncate. + Register Value = MemSetValue; + if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) { + MVT VT = getMVTForLLT(Ty); + MVT LargestVT = getMVTForLLT(LargestTy); + if (!LargestTy.isVector() && !Ty.isVector() && + TLI.isTruncateFree(LargestVT, VT)) + Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0); + else + Value = getMemsetValue(Val, Ty, MIB); + if (!Value) + return false; + } + + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes()); + + Register Ptr = Dst; + if (DstOff != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); + Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + } + + MIB.buildStore(Value, Ptr, *StoreMMO); + DstOff += Ty.getSizeInBytes(); + Size -= TySize; + } + + MI.eraseFromParent(); + return true; +} + + +bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst, + Register Src, unsigned KnownLen, + unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + assert(KnownLen != 0 && "Have a zero length memcpy length!"); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + unsigned Align = MinAlign(DstAlign, SrcAlign); + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + // FIXME: infer better src pointer alignment like SelectionDAG does here. 
+ // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining + // if the memcpy is in a tail call position. + + unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize); + std::vector<LLT> MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + const auto &SrcMMO = **std::next(MI.memoperands_begin()); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); + + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign, + /*IsMemset=*/false, + /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), + SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Align && + DL.exceedsNaturalStackAlignment(NewAlign)) + NewAlign /= 2; + + if (NewAlign > Align) { + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < NewAlign) + MFI.setObjectAlignment(FI, NewAlign); + Align = NewAlign; + } + } + + LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n"); + + MachineIRBuilder MIB(MI); + // Now we need to emit a pair of load and stores for each of the types we've + // collected. I.e. for each type, generate a load from the source pointer of + // that type width, and then generate a corresponding store to the dest buffer + // of that value loaded. This can result in a sequence of loads and stores + // mixed types, depending on what the target specifies as good types to use. + unsigned CurrOffset = 0; + LLT PtrTy = MRI.getType(Src); + unsigned Size = KnownLen; + for (auto CopyTy : MemOps) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + if (CopyTy.getSizeInBytes() > Size) + CurrOffset -= CopyTy.getSizeInBytes() - Size; + + // Construct MMOs for the accesses. + auto *LoadMMO = + MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); + + // Create the load. + Register LoadPtr = Src; + Register Offset; + if (CurrOffset != 0) { + Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + .getReg(0); + LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0); + } + auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); + + // Create the store. + Register StorePtr = + CurrOffset == 0 ? 
Dst : MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + MIB.buildStore(LdVal, StorePtr, *StoreMMO); + CurrOffset += CopyTy.getSizeInBytes(); + Size -= CopyTy.getSizeInBytes(); + } + + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst, + Register Src, unsigned KnownLen, + unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + assert(KnownLen != 0 && "Have a zero length memmove length!"); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + unsigned Align = MinAlign(DstAlign, SrcAlign); + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize); + std::vector<LLT> MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + const auto &SrcMMO = **std::next(MI.memoperands_begin()); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); + + // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due + // to a bug in it's findOptimalMemOpLowering implementation. For now do the + // same thing here. + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign, + /*IsMemset=*/false, + /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(), + SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Align && + DL.exceedsNaturalStackAlignment(NewAlign)) + NewAlign /= 2; + + if (NewAlign > Align) { + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < NewAlign) + MFI.setObjectAlignment(FI, NewAlign); + Align = NewAlign; + } + } + + LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n"); + + MachineIRBuilder MIB(MI); + // Memmove requires that we perform the loads first before issuing the stores. + // Apart from that, this loop is pretty much doing the same thing as the + // memcpy codegen function. + unsigned CurrOffset = 0; + LLT PtrTy = MRI.getType(Src); + SmallVector<Register, 16> LoadVals; + for (auto CopyTy : MemOps) { + // Construct MMO for the load. + auto *LoadMMO = + MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); + + // Create the load. + Register LoadPtr = Src; + if (CurrOffset != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); + LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0); + } + LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); + CurrOffset += CopyTy.getSizeInBytes(); + } + + CurrOffset = 0; + for (unsigned I = 0; I < MemOps.size(); ++I) { + LLT CopyTy = MemOps[I]; + // Now store the values loaded. 
+ auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); + + Register StorePtr = Dst; + if (CurrOffset != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset); + StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + } + MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); + CurrOffset += CopyTy.getSizeInBytes(); + } + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) { + // This combine is fairly complex so it's not written with a separate + // matcher function. + assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); + Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID(); + assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove || + ID == Intrinsic::memset) && + "Expected a memcpy like intrinsic"); + + auto MMOIt = MI.memoperands_begin(); + const MachineMemOperand *MemOp = *MMOIt; + bool IsVolatile = MemOp->isVolatile(); + // Don't try to optimize volatile. + if (IsVolatile) + return false; + + unsigned DstAlign = MemOp->getBaseAlignment(); + unsigned SrcAlign = 0; + unsigned Dst = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + Register Len = MI.getOperand(3).getReg(); + + if (ID != Intrinsic::memset) { + assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); + MemOp = *(++MMOIt); + SrcAlign = MemOp->getBaseAlignment(); + } + + // See if this is a constant length copy + auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI); + if (!LenVRegAndVal) + return false; // Leave it to the legalizer to lower it to a libcall. + unsigned KnownLen = LenVRegAndVal->Value; + + if (KnownLen == 0) { + MI.eraseFromParent(); + return true; + } + + if (ID == Intrinsic::memcpy) + return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); + if (ID == Intrinsic::memmove) + return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); + if (ID == Intrinsic::memset) + return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile); + return false; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7becc99fb5c..b78837aae83 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1100,6 +1100,32 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( return true; } +// Same as above but handling LLTs instead. +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + if (Fast) { + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || + Ty.getSizeInBytes() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. 
+ Ty == LLT::vector(2, 64); + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -8739,6 +8765,39 @@ EVT AArch64TargetLowering::getOptimalMemOpType( return MVT::Other; } +LLT AArch64TargetLowering::getOptimalMemOpLLT( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; + bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; + // Only use AdvSIMD to implement memset of 32-byte and above. It would have + // taken one instruction to materialize the v2i64 zero and one store (with + // restrictive addressing mode). Just do i64 stores. + bool IsSmallMemset = IsMemset && Size < 32; + auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { + if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + return true; + bool Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; + }; + + if (CanUseNEON && IsMemset && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, 16)) + return LLT::vector(2, 64); + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + return LLT::scalar(128); + if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + return LLT::scalar(64); + if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + return LLT::scalar(32); + return LLT(); +} + // 12-bit optionally shifted immediates are legal for adds. bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits<int64_t>::min()) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 4421c31f65c..34e1fdf441e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -272,6 +272,10 @@ public: EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; + /// LLT variant. + bool allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -358,6 +362,10 @@ public: bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const override; + /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. 
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 5f7245bfbd7..5ec209ada17 100644 --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -28,9 +28,9 @@ using namespace MIPatternMatch; namespace { class AArch64PreLegalizerCombinerInfo : public CombinerInfo { public: - AArch64PreLegalizerCombinerInfo() + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize) {} virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; @@ -51,6 +51,18 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: return Helper.tryCombineExtendingLoads(MI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // Try to inline memcpy type calls if optimizations are enabled. + return (EnableOpt && !EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI) + : false; + } + default: + break; + } } return false; @@ -89,7 +101,11 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) { MachineFunctionProperties::Property::FailedISel)) return false; auto *TPC = &getAnalysis<TargetPassConfig>(); - AArch64PreLegalizerCombinerInfo PCInfo; + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize()); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp index 85076590d40..ace0735652b 100644 --- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -27,7 +27,8 @@ class MipsPreLegalizerCombinerInfo : public CombinerInfo { public: MipsPreLegalizerCombinerInfo() : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false, + /*EnableOptSize*/ false, /*EnableMinSize*/ false) {} virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; |
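To make the chunking strategy of `findGISelOptimalMemOpLowering` above easier to follow, here is a small standalone C++ model (not LLVM code) of the greedy width selection, including the single overlapping tail access it can emit when the target tolerates unaligned accesses. The fixed 16/8/4/2/1-byte widths and the `pickMemOpWidths` name are assumptions for illustration only; the real code gets its starting type from `getOptimalMemOpLLT` and consults `allowsMisalignedMemoryAccesses` before overlapping.

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone model of the greedy store-width selection in
// findGISelOptimalMemOpLowering: repeatedly emit the widest acceptable chunk,
// optionally finishing with a single wider access that overlaps the previous
// one instead of a run of narrow tail accesses.
std::vector<unsigned> pickMemOpWidths(std::uint64_t Size, bool AllowOverlap,
                                      unsigned Limit) {
  std::vector<unsigned> Ops;
  unsigned Width = 16; // assumed widest "optimal" access, e.g. s128 / v2i64
  while (Size != 0) {
    unsigned Covered = Width;
    while (Covered > Size) {
      unsigned Narrower = Width / 2;
      // Keep the wide access and let it overlap the previous one if that
      // covers the whole remaining tail (mirrors the AllowOverlap path).
      if (AllowOverlap && !Ops.empty() && Narrower < Size) {
        Covered = Size;
        break;
      }
      Width = Narrower;
      Covered = Width;
    }
    if (Ops.size() + 1 > Limit)
      return {}; // too many ops: the combiner would fall back to a libcall
    Ops.push_back(Width);
    Size -= Covered;
  }
  return Ops;
}

int main() {
  // A 31-byte memcpy with overlap allowed becomes one 16-byte access at
  // offset 0 and one 16-byte access at offset 15 (they overlap by one byte).
  for (unsigned W : pickMemOpWidths(31, /*AllowOverlap=*/true, /*Limit=*/8))
    std::cout << W << ' ';
  std::cout << '\n'; // prints: 16 16
}
```

With `AllowOverlap` false, as the memmove path currently forces, the same 15-byte tail would instead be split into 8-, 4-, 2-, and 1-byte accesses.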