Diffstat (limited to 'llvm')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp           |  12
-rw-r--r--   llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp          | 247
-rw-r--r--   llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll        |   4
-rw-r--r--   llvm/test/CodeGen/AMDGPU/ds_write2.ll                    |   8
-rw-r--r--   llvm/test/CodeGen/AMDGPU/fceil64.ll                      |   4
-rw-r--r--   llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll        |   3
-rw-r--r--   llvm/test/CodeGen/AMDGPU/load-local-i16.ll               |  54
-rw-r--r--   llvm/test/CodeGen/AMDGPU/load-local-i32.ll               |  12
-rw-r--r--   llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll          |   3
-rw-r--r--   llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll  |   3
-rw-r--r--   llvm/test/CodeGen/AMDGPU/store-v3i64.ll                  |   3
-rw-r--r--   llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll      |   6
12 files changed, 209 insertions, 150 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c8900fffc44..a86603a11ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -497,6 +497,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
   // XXX - Can we get away without running DeadMachineInstructionElim again?
   addPass(&SIFoldOperandsID);
   addPass(&DeadMachineInstructionElimID);
+  addPass(&SILoadStoreOptimizerID);
 }
 
 void GCNPassConfig::addIRPasses() {
@@ -533,17 +534,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
-  if (getOptLevel() > CodeGenOpt::None) {
-    // Don't do this with no optimizations since it throws away debug info by
-    // merging nonadjacent loads.
-
-    // This should be run after scheduling, but before register allocation. It
-    // also need extra copies to the address operand to be eliminated.
-
-    // FIXME: Move pre-RA and remove extra reg coalescer run.
-    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
-    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
-  }
   addPass(createSIShrinkInstructionsPass());
   addPass(createSIWholeQuadModePass());
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index eec9f98d12c..d80b157d8a4 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -60,31 +60,35 @@ private:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
-  LiveIntervals *LIS;
+  AliasAnalysis *AA;
 
   static bool offsetsCanBeCombined(unsigned Offset0,
                                    unsigned Offset1,
                                    unsigned EltSize);
 
-  MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
-                                                 unsigned EltSize);
+  MachineBasicBlock::iterator findMatchingDSInst(
+    MachineBasicBlock::iterator I,
+    unsigned EltSize,
+    SmallVectorImpl<MachineInstr*> &InstsToMove);
 
   MachineBasicBlock::iterator mergeRead2Pair(
     MachineBasicBlock::iterator I,
     MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
 
   MachineBasicBlock::iterator mergeWrite2Pair(
     MachineBasicBlock::iterator I,
     MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
 
 public:
   static char ID;
 
   SILoadStoreOptimizer()
     : MachineFunctionPass(ID),
       TII(nullptr),
       TRI(nullptr),
       MRI(nullptr),
-      LIS(nullptr) {}
+      AA(nullptr) {}
 
   SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -98,17 +102,9 @@ public:
     return "SI Load / Store Optimizer";
   }
 
-  MachineFunctionProperties getRequiredProperties() const override {
-    return MachineFunctionProperties().set(
-      MachineFunctionProperties::Property::NoPHIs);
-  }
-
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreserved<LiveVariables>();
-    AU.addRequired<LiveIntervals>();
+    AU.addRequired<AAResultsWrapperPass>();
 
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -118,9 +114,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                       "SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                     "SI Load / Store Optimizer", false, false)
@@ -132,6 +126,40 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
   return new SILoadStoreOptimizer(TM);
 }
 
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+                           ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *MBB = I->getParent();
+  ++I;
+  for (MachineInstr *MI : InstsToMove) {
+    MI->removeFromParent();
+    MBB->insert(I, MI);
+  }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+                          SmallVectorImpl<const MachineOperand *> &Defs) {
+  for (const MachineOperand &Def : MI.defs()) {
+    Defs.push_back(&Def);
+  }
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                        ArrayRef<MachineInstr*> InstsToMove,
+                        const SIInstrInfo *TII,
+                        AliasAnalysis *AA) {
+
+  assert(MemOp.mayLoadOrStore());
+
+  for (MachineInstr *InstToMove : InstsToMove) {
+    if (!InstToMove->mayLoadOrStore())
+      continue;
+    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
+      return false;
+  }
+  return true;
+}
+
 bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                 unsigned Offset1,
                                                 unsigned Size) {
@@ -161,44 +189,98 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
 
 MachineBasicBlock::iterator
 SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
-                                         unsigned EltSize){
+                                  unsigned EltSize,
+                                  SmallVectorImpl<MachineInstr*> &InstsToMove) {
   MachineBasicBlock::iterator E = I->getParent()->end();
-  MachineBasicBlock &MBB = *I->getParent();
   MachineBasicBlock::iterator MBBI = I;
   ++MBBI;
 
-  if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
-    return E;
-
-  // Don't merge volatiles.
-  if (MBBI->hasOrderedMemoryRef())
-    return E;
-
-  int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
-  const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
-  const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
-  // Check same base pointer. Be careful of subregisters, which can occur with
-  // vectors of pointers.
-  if (AddrReg0.getReg() == AddrReg1.getReg() &&
-      AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
-    int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
-                                               AMDGPU::OpName::offset);
-    unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
-    unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
-    // Check both offsets fit in the reduced range.
-    if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
-      return MBBI;
-  }
+  SmallVector<const MachineOperand *, 8> DefsToMove;
+  addDefsToList(*I, DefsToMove);
+
+  for ( ; MBBI != E; ++MBBI) {
+
+    if (MBBI->getOpcode() != I->getOpcode()) {
+
+      // This is not a matching DS instruction, but we can keep looking as
+      // long as one of these conditions is met:
+      // 1. It is safe to move I down past MBBI.
+      // 2. It is safe to move MBBI down past the instruction that I will
+      //    be merged into.
+
+      if (MBBI->hasUnmodeledSideEffects())
+        // We can't re-order this instruction with respect to other memory
+        // operations, so we fail both conditions mentioned above.
+        return E;
+
+      if (MBBI->mayLoadOrStore() &&
+          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        // We fail condition #1, but we may still be able to satisfy condition
+        // #2.  Add this instruction to the move list and then we will check
+        // if condition #2 holds once we have selected the matching instruction.
+        InstsToMove.push_back(&*MBBI);
+        addDefsToList(*MBBI, DefsToMove);
+        continue;
+      }
+
+      // When we match I with another DS instruction we will be moving I down
+      // to the location of the matched instruction, so any uses of I will
+      // need to be moved down as well.
+      for (const MachineOperand *Def : DefsToMove) {
+        bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
+        // If ReadDef is true, then there is a use of Def between I
+        // and the instruction that I will potentially be merged with. We
+        // will need to move this instruction after the merged instructions.
+        if (ReadDef) {
+          InstsToMove.push_back(&*MBBI);
+          addDefsToList(*MBBI, DefsToMove);
+          break;
+        }
+      }
+      continue;
+    }
+
+    // Don't merge volatiles.
+    if (MBBI->hasOrderedMemoryRef())
+      return E;
+
+    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+    // Check same base pointer. Be careful of subregisters, which can occur with
+    // vectors of pointers.
+    if (AddrReg0.getReg() == AddrReg1.getReg() &&
+        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+                                                 AMDGPU::OpName::offset);
+      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+      // Check both offsets fit in the reduced range.
+      // We also need to go through the list of instructions that we plan to
+      // move and make sure they are all safe to move down past the merged
+      // instruction.
+      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+        return MBBI;
+    }
+
+    // We've found a load/store that we couldn't merge for some reason.
+    // We could potentially keep looking, but we'd need to make sure that
+    // it was safe to move I and also all the instructions in InstsToMove
+    // down past this instruction.
+    // FIXME: This is too conservative.
+    break;
+  }
   return E;
 }
 
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   MachineBasicBlock::iterator I,
   MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -247,7 +329,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   DebugLoc DL = I->getDebugLoc();
 
   MachineInstrBuilder Read2
-    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
       .addOperand(*AddrReg) // addr
       .addImm(NewOffset0) // offset0
       .addImm(NewOffset1) // offset1
@@ -258,48 +340,28 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 
   // Copy to the old destination registers.
-  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+  BuildMI(*MBB, Paired, DL, CopyDesc)
     .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
     .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
     .addOperand(*Dest1)
     .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  LIS->InsertMachineInstrInMaps(*Read2);
-
-  // repairLiveintervalsInRange() doesn't handle physical register, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
-  // The new write to the original destination register is now the copy. Steal
-  // the old SlotIndex.
-  LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
-  LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+  moveInstsAfter(Copy1, InstsToMove);
 
+  MachineBasicBlock::iterator Next = std::next(I);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
-  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
-  LIS->shrinkToUses(&AddrRegLI);
-
-  LIS->createAndComputeVirtRegInterval(DestReg);
-
-  if (UpdateM0Range) {
-    SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
-    M0Segment->end = Read2Index.getRegSlot();
-  }
-
   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Read2.getInstr();
+  return Next;
 }
 
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   MachineBasicBlock::iterator I,
   MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -341,15 +403,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = I->getDebugLoc();
 
-  // repairLiveintervalsInRange() doesn't handle physical register, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
   MachineInstrBuilder Write2
-    = BuildMI(*MBB, I, DL, Write2Desc)
+    = BuildMI(*MBB, Paired, DL, Write2Desc)
       .addOperand(*Addr) // addr
       .addOperand(*Data0) // data0
       .addOperand(*Data1) // data1
@@ -359,24 +414,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
       .addMemOperand(*I->memoperands_begin())
      .addMemOperand(*Paired->memoperands_begin());
 
-  // XXX - How do we express subregisters here?
-  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+  moveInstsAfter(Write2, InstsToMove);
 
-  LIS->RemoveMachineInstrFromMaps(*I);
-  LIS->RemoveMachineInstrFromMaps(*Paired);
+  MachineBasicBlock::iterator Next = std::next(I);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
-  // This doesn't handle physical registers like M0
-  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
-  if (UpdateM0Range) {
-    SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
-    M0Segment->end = Write2Index.getRegSlot();
-  }
-
   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Write2.getInstr();
+  return Next;
 }
 
 // Scan through looking for adjacent LDS operations with constant offsets from
@@ -394,13 +439,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
+    SmallVector<MachineInstr*, 8> InstsToMove;
     unsigned Opc = MI.getOpcode();
     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
       unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
       if (Match != E) {
         Modified = true;
-        I = mergeRead2Pair(I, Match, Size);
+        I = mergeRead2Pair(I, Match, Size, InstsToMove);
       } else {
         ++I;
       }
@@ -408,10 +455,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
       unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
       if (Match != E) {
         Modified = true;
-        I = mergeWrite2Pair(I, Match, Size);
+        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
       } else {
         ++I;
       }
@@ -437,8 +485,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   TRI = &TII->getRegisterInfo();
 
   MRI = &MF.getRegInfo();
-
-  LIS = &getAnalysis<LiveIntervals>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index d62782e5ec7..dab3c10d682 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -9,8 +9,8 @@
 ; SI-LABEL: {{^}}offset_order:
 
 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
 
 define void @offset_order(float addrspace(1)* %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 45fcc01b2ad..ae230dac937 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
 }
 
 ; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
 }
 
 ; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index fb5853b808e..f6cdbb48c75 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,8 +13,8 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
 ; SI-DAG: s_not_b64
 ; SI-DAG: s_and_b64
 ; SI-DAG: cmp_gt_i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 73a5c54e175..4c559f42d42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -25,7 +25,8 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
 ; SI: v_rsq_clamp_f64_e32
 
 ; TODO: this constant should be folded:
-; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
+; VI-DAG: s_mov_b32 [[NEG1:s[0-9+]]], -1
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], [[NEG1]]
 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index ce311bfc1e6..9b0cbaa7701 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -73,8 +73,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i16:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
 
 ; EG: LDS_READ_RET
@@ -287,11 +287,9 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
   ret void
 }
 
-; FIXME: Should have 2 ds_read_b64
 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 
 ; GCN: ds_write2_b64
 ; GCN: ds_write2_b64
@@ -314,9 +312,9 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
+
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
@@ -379,10 +377,18 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
@@ -407,17 +413,31 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
   ret void
 }
 
-; FIXME: Missed read2
 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index 03f0ffecb5d..271e563024c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -56,10 +56,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
 define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index f6c0e3c6239..a57e4f59532 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -44,8 +44,7 @@ entry:
 
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
+; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 
 ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 0e9618523e3..f6ed41d9dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -156,7 +156,8 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 
 ; FUNC-LABEL: @reorder_local_offsets
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index b4d7505e0a8..03ae01afbcd 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -46,8 +46,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
 }
 
 ; GCN-LABEL: {{^}}local_store_v3i64:
-; GCN: ds_write_b64
-; GCN: ds_write_b64
+; GCN: ds_write2_b64
 ; GCN: ds_write_b64
 define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 0435ed4d552..7a72a4c7ba0 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -42,12 +42,10 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
 }
 
 ; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-
 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
-; GCN-NOT: v_mov_b32
 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 
 ; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
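The merge decision in the new findMatchingDSInst() loop ultimately gates on offsetsCanBeCombined(), whose body is unchanged by this patch and therefore never appears in the hunks above. As context, here is a minimal C++ sketch of the constraint that routine enforces, reconstructed from the declaration shown above and the 8-bit, element-scaled offset0/offset1 fields of ds_read2/ds_write2; the stride-64 fallback (the ds_read2st64/ds_write2st64 variants) and the exact cutoffs are assumptions, not quoted from the file:

    #include "llvm/Support/MathExtras.h" // for llvm::isUInt

    // Sketch: can two DS byte offsets from the same base register be encoded
    // as a single read2/write2? EltSize is 4 for the B32 forms and 8 for B64.
    static bool offsetsCanBeCombinedSketch(unsigned Offset0, unsigned Offset1,
                                           unsigned EltSize) {
      if (Offset0 == Offset1)
        return false; // Identical offsets are not paired.
      if (Offset0 % EltSize != 0 || Offset1 % EltSize != 0)
        return false; // Offsets must be expressible in element units.

      unsigned EltOffset0 = Offset0 / EltSize;
      unsigned EltOffset1 = Offset1 / EltSize;
      if (llvm::isUInt<8>(EltOffset0) && llvm::isUInt<8>(EltOffset1))
        return true;  // Fits the plain ds_read2/ds_write2 8-bit fields.

      // Otherwise try the stride-64 variants, whose offset fields are
      // scaled by a further factor of 64 elements.
      return (EltOffset0 % 64 == 0) && (EltOffset1 % 64 == 0) &&
             llvm::isUInt<8>(EltOffset0 / 64) && llvm::isUInt<8>(EltOffset1 / 64);
    }

Under this rule, two DS_READ_B32s at byte offsets 0 and 8 from one base merge into a ds_read2_b32 with offset1:2, which is exactly the shape the updated FileCheck lines above expect.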

