diff options
-rw-r--r-- | llvm/include/llvm/Target/TargetSubtargetInfo.h | 10 | ||||
-rw-r--r-- | llvm/lib/CodeGen/RegisterCoalescer.cpp | 17 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMMachineFunctionInfo.h | 14 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMSubtarget.cpp | 51 | ||||
-rw-r--r-- | llvm/lib/Target/ARM/ARMSubtarget.h | 8 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/out-of-registers.ll | 42 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/vector-spilling.ll | 34 | ||||
-rw-r--r-- | llvm/test/CodeGen/ARM/vldm-sched-a9.ll | 8 |
8 files changed, 180 insertions, 4 deletions
diff --git a/llvm/include/llvm/Target/TargetSubtargetInfo.h b/llvm/include/llvm/Target/TargetSubtargetInfo.h index bbb83efc780..e2aea45f4c1 100644 --- a/llvm/include/llvm/Target/TargetSubtargetInfo.h +++ b/llvm/include/llvm/Target/TargetSubtargetInfo.h @@ -115,6 +115,16 @@ public: /// \brief Reset the features for the subtarget. virtual void resetSubtargetFeatures(const MachineFunction *MF) { } + + /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true. + virtual bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const + { return true; } + }; } // End llvm namespace diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 5aaeb874d68..0bda4c79987 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1037,6 +1037,23 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) { return false; } + if (CP.getNewRC()) { + const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>(); + auto SrcRC = MRI->getRegClass(CP.getSrcReg()); + auto DstRC = MRI->getRegClass(CP.getDstReg()); + unsigned SrcIdx = CP.getSrcIdx(); + unsigned DstIdx = CP.getDstIdx(); + if (CP.isFlipped()) { + std::swap(SrcIdx, DstIdx); + std::swap(SrcRC, DstRC); + } + if (!ST.shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx, + CP.getNewRC())) { + DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n"); + return false; + } + } + // Dead code elimination. This really should be handled by MachineDCE, but // sometimes dead copies slip through, and we can't generate invalid live // ranges. diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h index 44a9e3495b9..d3fabc3ebb0 100644 --- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h +++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/DenseMap.h" namespace llvm { @@ -118,6 +119,10 @@ class ARMFunctionInfo : public MachineFunctionInfo { /// being passed on the stack unsigned ArgumentStackSize; + /// CoalescedWeights - mapping of basic blocks to the rolling counter of + /// coalesced weights. + DenseMap<const MachineBasicBlock*, unsigned> CoalescedWeights; + public: ARMFunctionInfo() : isThumb(false), @@ -221,6 +226,15 @@ public: else return -1U; } + + DenseMap<const MachineBasicBlock*, unsigned>::iterator getCoalescedWeight( + MachineBasicBlock* MBB) { + auto It = CoalescedWeights.find(MBB); + if (It == CoalescedWeights.end()) { + It = CoalescedWeights.insert(std::make_pair(MBB, 0)).first; + } + return It; + } }; } // End llvm namespace diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index f21413b33ef..0c6ff529653 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -18,6 +18,7 @@ #include "ARMJITInfo.h" #include "ARMSelectionDAGInfo.h" #include "ARMSubtarget.h" +#include "ARMMachineFunctionInfo.h" #include "Thumb1FrameLowering.h" #include "Thumb1InstrInfo.h" #include "Thumb2InstrInfo.h" @@ -27,6 +28,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" using namespace llvm; @@ -449,3 +452,51 @@ bool ARMSubtarget::useMovt(const MachineFunction &MF) const { !MF.getFunction()->getAttributes().hasAttribute( AttributeSet::FunctionIndex, Attribute::MinSize)); } + +bool ARMSubtarget::shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const { + auto MBB = MI->getParent(); + auto MF = MBB->getParent(); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + // If not copying into a sub-register this should be ok because we shouldn't + // need to split the reg. + if (!DstSubReg) + return true; + // Small registers don't frequently cause a problem, so we can coalesce them. + if (NewRC->getSize() < 32 && DstRC->getSize() < 32 && SrcRC->getSize() < 32) + return true; + + auto NewRCWeight = + MRI.getTargetRegisterInfo()->getRegClassWeight(NewRC); + auto SrcRCWeight = + MRI.getTargetRegisterInfo()->getRegClassWeight(SrcRC); + auto DstRCWeight = + MRI.getTargetRegisterInfo()->getRegClassWeight(DstRC); + // If the source register class is more expensive than the destination, the + // coalescing is probably profitable. + if (SrcRCWeight.RegWeight > NewRCWeight.RegWeight) + return true; + if (DstRCWeight.RegWeight > NewRCWeight.RegWeight) + return true; + + // If the register allocator isn't constrained, we can always allow coalescing + // unfortunately we don't know yet if we will be constrained. + // The goal of this heuristic is to restrict how many expensive registers + // we allow to coalesce in a given basic block. + auto AFI = MF->getInfo<ARMFunctionInfo>(); + auto It = AFI->getCoalescedWeight(MBB); + + DEBUG(dbgs() << "\tARM::shouldCoalesce - Coalesced Weight: " << It->second << "\n"); + DEBUG(dbgs() << "\tARM::shouldCoalesce - Reg Weight: " << NewRCWeight.RegWeight << "\n"); + unsigned SizeMultiplier = MBB->size()/100; + SizeMultiplier = SizeMultiplier ? SizeMultiplier : 1; + if (It->second < NewRCWeight.WeightLimit * SizeMultiplier) { + It->second += NewRCWeight.RegWeight; + return true; + } + return false; +} diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 44d2159cb54..626bb0e7860 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -451,6 +451,14 @@ public: /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect /// symbol. bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + + /// \brief SrcRC and DstRC will be morphed into NewRC if this returns true + bool shouldCoalesce(MachineInstr *MI, + const TargetRegisterClass *SrcRC, + unsigned SubReg, + const TargetRegisterClass *DstRC, + unsigned DstSubReg, + const TargetRegisterClass *NewRC) const override; }; } // End llvm namespace diff --git a/llvm/test/CodeGen/ARM/out-of-registers.ll b/llvm/test/CodeGen/ARM/out-of-registers.ll new file mode 100644 index 00000000000..790e4165d4c --- /dev/null +++ b/llvm/test/CodeGen/ARM/out-of-registers.ll @@ -0,0 +1,42 @@ +; RUN: llc -O3 %s -o - | FileCheck %s +; ModuleID = 'fo.c' +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n8:16:32-S64" +target triple = "thumbv7-none-linux-gnueabi" + +; CHECK: vpush +; CHECK: vpop + +define void @foo(float* nocapture %A) #0 { + %1= bitcast float* %A to i8* + %2 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4) + %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 0 + %divp_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %3 + %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 1 + %div3p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %4 + %5 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 2 + %div8p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %5 + %6 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %2, 3 + %div13p_vec = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %6 + tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %divp_vec, <4 x float> %div3p_vec, <4 x float> %div8p_vec, <4 x float> %div13p_vec, i32 4) + ret void +} + +; Function Attrs: nounwind +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 + +; Function Attrs: nounwind readonly + +; Function Attrs: nounwind +declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) #1 +declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32) #2 + +; Function Attrs: nounwind + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"Snapdragon LLVM ARM Compiler 3.4"} +!1 = metadata !{metadata !1} diff --git a/llvm/test/CodeGen/ARM/vector-spilling.ll b/llvm/test/CodeGen/ARM/vector-spilling.ll new file mode 100644 index 00000000000..746c6dfcd11 --- /dev/null +++ b/llvm/test/CodeGen/ARM/vector-spilling.ll @@ -0,0 +1,34 @@ +; RUN: llc < %s -march=arm -mtriple=armv7-linux-gnueabihf -arm-atomic-cfg-tidy=0 -float-abi=hard -mcpu=cortex-a9 -O3 | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64" + +; This test will generate spills/fills using vldmia instructions that access 24 bytes of memory. +; Check that we don't crash when we generate these instructions on Cortex-A9. + +; CHECK: test: +; CHECK: vstmia +; CHECK: vldmia +define void @test(<8 x i64>* %src) #0 { +entry: + %0 = getelementptr inbounds <8 x i64>* %src, i32 0 + %1 = load <8 x i64>* %0, align 8 + + %2 = getelementptr inbounds <8 x i64>* %src, i32 1 + %3 = load <8 x i64>* %2, align 8 + + %4 = getelementptr inbounds <8 x i64>* %src, i32 2 + %5 = load <8 x i64>* %4, align 8 + + %6 = getelementptr inbounds <8 x i64>* %src, i32 3 + %7 = load <8 x i64>* %6, align 8 + + %8 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11> + %9 = shufflevector <8 x i64> %1, <8 x i64> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> + + tail call void(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>)* @foo(<8 x i64> %1, <8 x i64> %3, <8 x i64> %5, <8 x i64> %7, <8 x i64> %8, <8 x i64> %9) + ret void +} + +declare void @foo(<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>) + +attributes #0 = { noredzone "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/ARM/vldm-sched-a9.ll b/llvm/test/CodeGen/ARM/vldm-sched-a9.ll index f2e5eb9b7e0..64f3770e3d2 100644 --- a/llvm/test/CodeGen/ARM/vldm-sched-a9.ll +++ b/llvm/test/CodeGen/ARM/vldm-sched-a9.ll @@ -2,12 +2,12 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32-S64" -; This test will generate spills/fills using vldmia instructions that access 64 bytes of memory. -; Check that we don't crash when we generate these instructions on Cortex-A9. +; This test used to test vector spilling using vstmia/vldmia instructions, but +; the changes for PR:18825 prevent that spilling. ; CHECK: test: -; CHECK: vstmia -; CHECK: vldmia +; CHECK-NOT: vstmia +; CHECK-NOT: vldmia define void @test(i64* %src) #0 { entry: %arrayidx39 = getelementptr inbounds i64* %src, i32 13 |