summaryrefslogtreecommitdiffstats
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp247
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/ds_write2.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/fceil64.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i16.ll54
-rw-r--r--llvm/test/CodeGen/AMDGPU/load-local-i32.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-v3i64.ll3
-rw-r--r--llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll6
12 files changed, 209 insertions, 150 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c8900fffc44..a86603a11ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -497,6 +497,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
// XXX - Can we get away without running DeadMachineInstructionElim again?
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
+ addPass(&SILoadStoreOptimizerID);
}
void GCNPassConfig::addIRPasses() {
@@ -533,17 +534,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
#endif
void GCNPassConfig::addPreRegAlloc() {
- if (getOptLevel() > CodeGenOpt::None) {
- // Don't do this with no optimizations since it throws away debug info by
- // merging nonadjacent loads.
-
- // This should be run after scheduling, but before register allocation. It
- // also need extra copies to the address operand to be eliminated.
-
- // FIXME: Move pre-RA and remove extra reg coalescer run.
- insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
- insertPass(&MachineSchedulerID, &RegisterCoalescerID);
- }
addPass(createSIShrinkInstructionsPass());
addPass(createSIWholeQuadModePass());
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index eec9f98d12c..d80b157d8a4 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -60,31 +60,35 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- LiveIntervals *LIS;
+ AliasAnalysis *AA;
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
- MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize);
+ MachineBasicBlock::iterator findMatchingDSInst(
+ MachineBasicBlock::iterator I,
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove);
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
MachineBasicBlock::iterator mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize);
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove);
public:
static char ID;
SILoadStoreOptimizer()
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
- LIS(nullptr) {}
+ AA(nullptr) {}
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -98,17 +102,9 @@ public:
return "SI Load / Store Optimizer";
}
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoPHIs);
- }
-
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addPreserved<SlotIndexes>();
- AU.addPreserved<LiveIntervals>();
- AU.addPreserved<LiveVariables>();
- AU.addRequired<LiveIntervals>();
+ AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -118,9 +114,7 @@ public:
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
@@ -132,6 +126,40 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
return new SILoadStoreOptimizer(TM);
}
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+ ArrayRef<MachineInstr*> InstsToMove) {
+ MachineBasicBlock *MBB = I->getParent();
+ ++I;
+ for (MachineInstr *MI : InstsToMove) {
+ MI->removeFromParent();
+ MBB->insert(I, MI);
+ }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+ SmallVectorImpl<const MachineOperand *> &Defs) {
+ for (const MachineOperand &Def : MI.defs()) {
+ Defs.push_back(&Def);
+ }
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+ ArrayRef<MachineInstr*> InstsToMove,
+ const SIInstrInfo *TII,
+ AliasAnalysis *AA) {
+
+ assert(MemOp.mayLoadOrStore());
+
+ for (MachineInstr *InstToMove : InstsToMove) {
+ if (!InstToMove->mayLoadOrStore())
+ continue;
+ if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
+ return false;
+ }
+ return true;
+}
+
bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned Size) {
@@ -161,44 +189,98 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
- unsigned EltSize){
+ unsigned EltSize,
+ SmallVectorImpl<MachineInstr*> &InstsToMove) {
MachineBasicBlock::iterator E = I->getParent()->end();
- MachineBasicBlock &MBB = *I->getParent();
MachineBasicBlock::iterator MBBI = I;
++MBBI;
- if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
- return E;
-
- // Don't merge volatiles.
- if (MBBI->hasOrderedMemoryRef())
- return E;
-
- int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
- const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
- const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
- // Check same base pointer. Be careful of subregisters, which can occur with
- // vectors of pointers.
- if (AddrReg0.getReg() == AddrReg1.getReg() &&
- AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
- int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
- AMDGPU::OpName::offset);
- unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
- unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
- // Check both offsets fit in the reduced range.
- if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
- return MBBI;
- }
+ SmallVector<const MachineOperand *, 8> DefsToMove;
+ addDefsToList(*I, DefsToMove);
+
+ for ( ; MBBI != E; ++MBBI) {
+
+ if (MBBI->getOpcode() != I->getOpcode()) {
+
+ // This is not a matching DS instruction, but we can keep looking as
+ // long as one of these conditions is met:
+ // 1. It is safe to move I down past MBBI.
+ // 2. It is safe to move MBBI down past the instruction that I will
+ // be merged into.
+
+ if (MBBI->hasUnmodeledSideEffects())
+ // We can't re-order this instruction with respect to other memory
+ // operations, so we fail both conditions mentioned above.
+ return E;
+ if (MBBI->mayLoadOrStore() &&
+ !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+ // We fail condition #1, but we may still be able to satisfy condition
+ // #2. Add this instruction to the move list and then we will check
+ // if condition #2 holds once we have selected the matching instruction.
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ continue;
+ }
+
+ // When we match I with another DS instruction, we will be moving I down
+ // to the location of the matched instruction; any uses of I will need to
+ // be moved down as well.
+ for (const MachineOperand *Def : DefsToMove) {
+ bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
+ // If ReadDef is true, then there is a use of Def between I
+ // and the instruction that I will potentially be merged with. We
+ // will need to move this instruction after the merged instructions.
+ if (ReadDef) {
+ InstsToMove.push_back(&*MBBI);
+ addDefsToList(*MBBI, DefsToMove);
+ break;
+ }
+ }
+ continue;
+ }
+
+ // Don't merge volatiles.
+ if (MBBI->hasOrderedMemoryRef())
+ return E;
+
+ int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+ const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+ const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+ // Check same base pointer. Be careful of subregisters, which can occur with
+ // vectors of pointers.
+ if (AddrReg0.getReg() == AddrReg1.getReg() &&
+ AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+ AMDGPU::OpName::offset);
+ unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+ unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+ // Check both offsets fit in the reduced range.
+ // We also need to go through the list of instructions that we plan to
+ // move and make sure they are all safe to move down past the merged
+ // instruction.
+ if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+ canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+ return MBBI;
+ }
+
+ // We've found a load/store that we couldn't merge for some reason.
+ // We could potentially keep looking, but we'd need to make sure that
+ // it was safe to move I and also all the instructions in InstsToMove
+ // down past this instruction.
+ // FIXME: This is too conservative.
+ break;
+ }
return E;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -247,7 +329,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
DebugLoc DL = I->getDebugLoc();
MachineInstrBuilder Read2
- = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+ = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
.addOperand(*AddrReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
@@ -258,48 +340,28 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
- MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+ BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
- MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+ MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- LIS->InsertMachineInstrInMaps(*Read2);
-
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
- // The new write to the original destination register is now the copy. Steal
- // the old SlotIndex.
- LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
- LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+ moveInstsAfter(Copy1, InstsToMove);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
- LIS->shrinkToUses(&AddrRegLI);
-
- LIS->createAndComputeVirtRegInterval(DestReg);
-
- if (UpdateM0Range) {
- SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
- M0Segment->end = Read2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
- return Read2.getInstr();
+ return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
- unsigned EltSize) {
+ unsigned EltSize,
+ ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -341,15 +403,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = I->getDebugLoc();
- // repairLiveintervalsInRange() doesn't handle physical register, so we have
- // to update the M0 range manually.
- SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
- LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
- LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
- bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
MachineInstrBuilder Write2
- = BuildMI(*MBB, I, DL, Write2Desc)
+ = BuildMI(*MBB, Paired, DL, Write2Desc)
.addOperand(*Addr) // addr
.addOperand(*Data0) // data0
.addOperand(*Data1) // data1
@@ -359,24 +414,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
- // XXX - How do we express subregisters here?
- unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+ moveInstsAfter(Write2, InstsToMove);
- LIS->RemoveMachineInstrFromMaps(*I);
- LIS->RemoveMachineInstrFromMaps(*Paired);
+ MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
- // This doesn't handle physical registers like M0
- LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
- if (UpdateM0Range) {
- SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
- M0Segment->end = Write2Index.getRegSlot();
- }
-
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
- return Write2.getInstr();
+ return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
@@ -394,13 +439,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
+ SmallVector<MachineInstr*, 8> InstsToMove;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeRead2Pair(I, Match, Size);
+ I = mergeRead2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@@ -408,10 +455,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
- MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+ MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+ InstsToMove);
if (Match != E) {
Modified = true;
- I = mergeWrite2Pair(I, Match, Size);
+ I = mergeWrite2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@@ -437,8 +485,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
-
- LIS = &getAnalysis<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index d62782e5ec7..dab3c10d682 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -9,8 +9,8 @@
; SI-LABEL: {{^}}offset_order:
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
define void @offset_order(float addrspace(1)* %out) {
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 45fcc01b2ad..ae230dac937 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
}
; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
}
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index fb5853b808e..f6cdbb48c75 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,8 +13,8 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; CI: v_ceil_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
; SI-DAG: s_not_b64
; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 73a5c54e175..4c559f42d42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -25,7 +25,8 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
; SI: v_rsq_clamp_f64_e32
; TODO: this constant should be folded:
-; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
+; VI-DAG: s_mov_b32 [[NEG1:s[0-9+]]], -1
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], [[NEG1]]
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index ce311bfc1e6..9b0cbaa7701 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -73,8 +73,8 @@ entry:
}
; FUNC-LABEL: {{^}}local_load_v16i16:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; EG: LDS_READ_RET
@@ -287,11 +287,9 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
ret void
}
-; FIXME: Should have 2 ds_read_b64
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN: ds_write2_b64
; GCN: ds_write2_b64
@@ -314,9 +312,9 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
}
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
+
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; EG: LDS_READ_RET
; EG: LDS_READ_RET
@@ -379,10 +377,18 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
}
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
@@ -407,17 +413,31 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
ret void
}
-; FIXME: Missed read2
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index 03f0ffecb5d..271e563024c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -56,10 +56,14 @@ entry:
}
; FUNC-LABEL: {{^}}local_load_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index f6c0e3c6239..a57e4f59532 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -44,8 +44,7 @@ entry:
; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
+; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 0e9618523e3..f6ed41d9dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -156,7 +156,8 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
; FUNC-LABEL: @reorder_local_offsets
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
; CI: buffer_store_dword
; CI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index b4d7505e0a8..03ae01afbcd 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -46,8 +46,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
}
; GCN-LABEL: {{^}}local_store_v3i64:
-; GCN: ds_write_b64
-; GCN: ds_write_b64
+; GCN: ds_write2_b64
; GCN: ds_write_b64
define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 0435ed4d552..7a72a4c7ba0 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -42,12 +42,10 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-
; GCN: buffer_load_dword [[VA0:v[0-9]+]]
-; GCN-NOT: v_mov_b32
; GCN: buffer_load_dword [[VA1:v[0-9]+]]
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
OpenPOWER on IntegriCloud