diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-06-27 20:32:13 +0000 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2016-06-27 20:32:13 +0000 |
commit | 03d8584590c647add646df7557b2e448633ce77c (patch) | |
tree | f3b960436a3e4bc947e267a97ca8644a6d098a4a | |
parent | 5cdf699daafe87163242a1cc9b4109fd3cb576ff (diff) | |
download | bcm5719-llvm-03d8584590c647add646df7557b2e448633ce77c.tar.gz bcm5719-llvm-03d8584590c647add646df7557b2e448633ce77c.zip |
AMDGPU: Move subtarget feature checks into passes
llvm-svn: 273937
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPU.td | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 6 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 1 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 5 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 44 | ||||
-rw-r--r-- | llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/captured-frame-index.ll | 2 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/extload-private.ll | 6 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll | 3 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/structurize1.ll | 2 |
11 files changed, 46 insertions, 38 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7736fd6c4cf..607e8d9bfdd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -241,12 +241,6 @@ def FeatureEnableUnsafeDSOffsetFolding : SubtargetFeature < "Force using DS instruction immediate offsets on SI" >; -def FeatureIfCvt : SubtargetFeature <"disable-ifcvt", - "EnableIfCvt", - "false", - "Disable the if conversion pass" ->; - def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler", "EnableSIScheduler", "true", diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 17b45fa65f1..fa8709e4f2b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -124,6 +124,10 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { if (!TM || skipFunction(F)) return false; + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + if (!ST.isPromoteAllocaEnabled()) + return false; + FunctionType *FTy = F.getFunctionType(); // If the function has any arguments in the local address space, then it's @@ -139,8 +143,6 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { } } - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); - LocalMemLimit = ST.getLocalMemorySize(); if (LocalMemLimit == 0) return false; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index e973f8e4837..39032b682e1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -105,7 +105,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, EnableVGPRSpilling(false), EnablePromoteAlloca(false), - EnableIfCvt(true), EnableLoadStoreOpt(false), EnableUnsafeDSOffsetFolding(false), EnableSIScheduler(false), diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 53117e3cb60..9a0adf1b166 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -82,7 +82,6 @@ protected: // Used as options. bool EnableVGPRSpilling; bool EnablePromoteAlloca; - bool EnableIfCvt; bool EnableLoadStoreOpt; bool EnableUnsafeDSOffsetFolding; bool EnableSIScheduler; @@ -222,10 +221,6 @@ public: return EnablePromoteAlloca; } - bool isIfCvtEnabled() const { - return EnableIfCvt; - } - bool unsafeDSOffsetFoldingEnabled() const { return EnableUnsafeDSOffsetFolding; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 54a28fde83f..162bbc2f91c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -45,6 +45,18 @@ static cl::opt<bool> EnableR600StructurizeCFG( cl::desc("Use StructurizeCFG IR pass"), cl::init(true)); +static cl::opt<bool> EnableSROA( + "amdgpu-sroa", + cl::desc("Run SROA after promote alloca pass"), + cl::ReallyHidden, + cl::init(true)); + +static cl::opt<bool> EnableR600IfConvert( + "r600-if-convert", + cl::desc("Use if conversion pass"), + cl::ReallyHidden, + cl::init(true)); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine<R600TargetMachine> X(TheAMDGPUTarget); @@ -212,12 +224,7 @@ public: } ScheduleDAGInstrs * - createMachineScheduler(MachineSchedContext *C) const override { - const SISubtarget *ST = getGCNTargetMachine().getSubtargetImpl(); - if (ST->enableSIScheduler()) - return createSIMachineScheduler(C); - return nullptr; - } + createMachineScheduler(MachineSchedContext *C) const override; bool addPreISel() override; void addMachineSSAOptimization() override; @@ -285,10 +292,11 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createAMDGPUOpenCLImageTypeLoweringPass()); const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine(); - const AMDGPUSubtarget &ST = *TM.getSubtargetImpl(); - if (TM.getOptLevel() > CodeGenOpt::None && ST.isPromoteAllocaEnabled()) { + if (TM.getOptLevel() > CodeGenOpt::None) { addPass(createAMDGPUPromoteAlloca(&TM)); - addPass(createSROAPass()); + + if (EnableSROA) + addPass(createSROAPass()); } addStraightLineScalarOptimizationPasses(); @@ -344,9 +352,8 @@ void R600PassConfig::addPreRegAlloc() { } void R600PassConfig::addPreSched2() { - const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); addPass(createR600EmitClauseMarkers(), false); - if (ST.isIfCvtEnabled()) + if (EnableR600IfConvert) addPass(&IfConverterID, false); addPass(createR600ClauseMergePass(*TM), false); } @@ -367,6 +374,14 @@ TargetPassConfig *R600TargetMachine::createPassConfig(PassManagerBase &PM) { // GCN Pass Setup //===----------------------------------------------------------------------===// +ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler( + MachineSchedContext *C) const { + const SISubtarget &ST = C->MF->getSubtarget<SISubtarget>(); + if (ST.enableSIScheduler()) + return createSIMachineScheduler(C); + return nullptr; +} + bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); @@ -415,8 +430,6 @@ bool GCNPassConfig::addRegBankSelect() { #endif void GCNPassConfig::addPreRegAlloc() { - const SISubtarget &ST = *getGCNTargetMachine().getSubtargetImpl(); - // This needs to be run directly before register allocation because // earlier passes might recompute live intervals. // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass @@ -424,15 +437,18 @@ void GCNPassConfig::addPreRegAlloc() { insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID); } - if (getOptLevel() > CodeGenOpt::None && ST.loadStoreOptEnabled()) { + if (getOptLevel() > CodeGenOpt::None) { // Don't do this with no optimizations since it throws away debug info by // merging nonadjacent loads. // This should be run after scheduling, but before register allocation. It // also need extra copies to the address operand to be eliminated. + + // FIXME: Move pre-RA and remove extra reg coalescer run. insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID); insertPass(&MachineSchedulerID, &RegisterCoalescerID); } + addPass(createSIShrinkInstructionsPass()); addPass(createSIWholeQuadModePass()); } diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 0b9b29a54b5..9e972a569a0 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -412,6 +412,9 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { return false; const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + if (!STM.loadStoreOptEnabled()) + return false; + TII = STM.getInstrInfo(); TRI = &TII->getRegisterInfo(); diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll index 978b6da7b7c..161c46b486e 100644 --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}stored_fi_to_lds: ; GCN: s_load_dword [[LDSPTR:s[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index ae419a6a353..a0857273e3e 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -1,9 +1,9 @@ ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; OPT-LABEL: @test_sink_global_small_offset_i32( ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll index 294c3a9c678..3f27370d703 100644 --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s -; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: ; SI: buffer_load_sbyte v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen @@ -39,7 +39,7 @@ entry: define void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16 - %tmp1 = load i16, i16* %tmp0 + %tmp1 = load volatile i16, i16* %tmp0 %tmp2 = zext i16 %tmp1 to i32 store i32 %tmp2, i32 addrspace(1)* %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll index f32b044198a..ea943a533c8 100644 --- a/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/llvm/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -1,5 +1,4 @@ -; Function Attrs: nounwind -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s ; ; CFG flattening should use parallel-and mode to generate branch conditions and ; then merge if-regions with the same bodies. diff --git a/llvm/test/CodeGen/AMDGPU/structurize1.ll b/llvm/test/CodeGen/AMDGPU/structurize1.ll index 77432c1f9d2..db0f50247e3 100644 --- a/llvm/test/CodeGen/AMDGPU/structurize1.ll +++ b/llvm/test/CodeGen/AMDGPU/structurize1.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=r600 -mattr=disable-ifcvt -mcpu=redwood | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -r600-if-convert=0 < %s | FileCheck %s ; This tests for abug where the AMDILCFGStructurizer was crashing on loops ; like this: |