12 files changed, 209 insertions, 150 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index c8900fffc44..a86603a11ff 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -497,6 +497,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
   // XXX - Can we get away without running DeadMachineInstructionElim again?
   addPass(&SIFoldOperandsID);
   addPass(&DeadMachineInstructionElimID);
+  addPass(&SILoadStoreOptimizerID);
 }
 
 void GCNPassConfig::addIRPasses() {
@@ -533,17 +534,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
-  if (getOptLevel() > CodeGenOpt::None) {
-    // Don't do this with no optimizations since it throws away debug info by
-    // merging nonadjacent loads.
-
-    // This should be run after scheduling, but before register allocation. It
-    // also need extra copies to the address operand to be eliminated.
-
-    // FIXME: Move pre-RA and remove extra reg coalescer run.
-    insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
-    insertPass(&MachineSchedulerID, &RegisterCoalescerID);
-  }
 
   addPass(createSIShrinkInstructionsPass());
   addPass(createSIWholeQuadModePass());
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index eec9f98d12c..d80b157d8a4 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -60,31 +60,35 @@ private:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   MachineRegisterInfo *MRI;
-  LiveIntervals *LIS;
+  AliasAnalysis *AA;
 
   static bool offsetsCanBeCombined(unsigned Offset0,
                                    unsigned Offset1,
                                    unsigned EltSize);
 
-  MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
-                                                 unsigned EltSize);
+  MachineBasicBlock::iterator findMatchingDSInst(
+    MachineBasicBlock::iterator I,
+    unsigned EltSize,
+    SmallVectorImpl<MachineInstr*> &InstsToMove);
 
   MachineBasicBlock::iterator mergeRead2Pair(
     MachineBasicBlock::iterator I,
     MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
 
   MachineBasicBlock::iterator mergeWrite2Pair(
     MachineBasicBlock::iterator I,
     MachineBasicBlock::iterator Paired,
-    unsigned EltSize);
+    unsigned EltSize,
+    ArrayRef<MachineInstr*> InstsToMove);
 
 public:
   static char ID;
 
   SILoadStoreOptimizer()
       : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
-        LIS(nullptr) {}
+        AA(nullptr) {}
 
   SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@@ -98,17 +102,9 @@ public:
     return "SI Load / Store Optimizer";
   }
 
-  MachineFunctionProperties getRequiredProperties() const override {
-    return MachineFunctionProperties().set(
-      MachineFunctionProperties::Property::NoPHIs);
-  }
-
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addPreserved<SlotIndexes>();
-    AU.addPreserved<LiveIntervals>();
-    AU.addPreserved<LiveVariables>();
-    AU.addRequired<LiveIntervals>();
+    AU.addRequired<AAResultsWrapperPass>();
 
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -118,9 +114,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                       "SI Load / Store Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(LiveVariables)
-INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                     "SI Load / Store Optimizer", false, false)
 
@@ -132,6 +126,40 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
   return new SILoadStoreOptimizer(TM);
 }
 
+static void moveInstsAfter(MachineBasicBlock::iterator I,
+                           ArrayRef<MachineInstr*> InstsToMove) {
+  MachineBasicBlock *MBB = I->getParent();
+  ++I;
+  for (MachineInstr *MI : InstsToMove) {
+    MI->removeFromParent();
+    MBB->insert(I, MI);
+  }
+}
+
+static void addDefsToList(const MachineInstr &MI,
+                          SmallVectorImpl<const MachineOperand *> &Defs) {
+  for (const MachineOperand &Def : MI.defs()) {
+    Defs.push_back(&Def);
+  }
+}
+
+static bool
+canMoveInstsAcrossMemOp(MachineInstr &MemOp,
+                        ArrayRef<MachineInstr*> InstsToMove,
+                        const SIInstrInfo *TII,
+                        AliasAnalysis *AA) {
+
+  assert(MemOp.mayLoadOrStore());
+
+  for (MachineInstr *InstToMove : InstsToMove) {
+    if (!InstToMove->mayLoadOrStore())
+      continue;
+    if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
+      return false;
+  }
+  return true;
+}
+
 bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
                                                 unsigned Offset1,
                                                 unsigned Size) {
@@ -161,44 +189,98 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
 
 MachineBasicBlock::iterator
 SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
-                                         unsigned EltSize){
+                                  unsigned EltSize,
+                                  SmallVectorImpl<MachineInstr*> &InstsToMove) {
   MachineBasicBlock::iterator E = I->getParent()->end();
-  MachineBasicBlock &MBB = *I->getParent();
   MachineBasicBlock::iterator MBBI = I;
   ++MBBI;
 
-  if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
-    return E;
-
-  // Don't merge volatiles.
-  if (MBBI->hasOrderedMemoryRef())
-    return E;
-
-  int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
-  const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
-  const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
-
-  // Check same base pointer. Be careful of subregisters, which can occur with
-  // vectors of pointers.
-  if (AddrReg0.getReg() == AddrReg1.getReg() &&
-      AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
-    int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
-                                               AMDGPU::OpName::offset);
-    unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
-    unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
-
-    // Check both offsets fit in the reduced range.
-    if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
-      return MBBI;
-  }
+  SmallVector<const MachineOperand *, 8> DefsToMove;
+  addDefsToList(*I, DefsToMove);
+
+  for ( ; MBBI != E; ++MBBI) {
+
+    if (MBBI->getOpcode() != I->getOpcode()) {
+
+      // This is not a matching DS instruction, but we can keep looking as
+      // long as one of these conditions are met:
+      // 1. It is safe to move I down past MBBI.
+      // 2. It is safe to move MBBI down past the instruction that I will
+      //    be merged into.
+
+      if (MBBI->hasUnmodeledSideEffects())
+        // We can't re-order this instruction with respect to other memory
+        // opeations, so we fail both conditions mentioned above.
+        return E;
 
+      if (MBBI->mayLoadOrStore() &&
+          !TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
+        // We fail condition #1, but we may still be able to satisfy condition
+        // #2.  Add this instruction to the move list and then we will check
+        // if condition #2 holds once we have selected the matching instruction.
+        InstsToMove.push_back(&*MBBI);
+        addDefsToList(*MBBI, DefsToMove);
+        continue;
+      }
+
+      // When we match I with another DS instruction we will be moving I down
+      // to the location of the matched instruction any uses of I will need to
+      // be moved down as well.
+      for (const MachineOperand *Def : DefsToMove) {
+        bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
+        // If ReadDef is true, then there is a use of Def between I
+        // and the instruction that I will potentially be merged with. We
+        // will need to move this instruction after the merged instructions.
+        if (ReadDef) {
+          InstsToMove.push_back(&*MBBI);
+          addDefsToList(*MBBI, DefsToMove);
+          break;
+        }
+      }
+      continue;
+    }
+
+    // Don't merge volatiles.
+    if (MBBI->hasOrderedMemoryRef())
+      return E;
+
+    int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
+    const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
+    const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
+
+    // Check same base pointer. Be careful of subregisters, which can occur with
+    // vectors of pointers.
+    if (AddrReg0.getReg() == AddrReg1.getReg() &&
+        AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
+      int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
+                                                 AMDGPU::OpName::offset);
+      unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
+      unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
+
+      // Check both offsets fit in the reduced range.
+      // We also need to go through the list of instructions that we plan to
+      // move and make sure they are all safe to move down past the merged
+      // instruction.
+      if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
+          canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
+        return MBBI;
+    }
+
+    // We've found a load/store that we couldn't merge for some reason.
+    // We could potentially keep looking, but we'd need to make sure that
+    // it was safe to move I and also all the instruction in InstsToMove
+    // down past this instruction.
+    // FIXME: This is too conservative.
+    break;
+  }
   return E;
 }
 
 MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   MachineBasicBlock::iterator I,
   MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
 
   // Be careful, since the addresses could be subregisters themselves in weird
@@ -247,7 +329,7 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
 
   DebugLoc DL = I->getDebugLoc();
   MachineInstrBuilder Read2
-    = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
+    = BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
     .addOperand(*AddrReg) // addr
     .addImm(NewOffset0) // offset0
     .addImm(NewOffset1) // offset1
@@ -258,48 +340,28 @@ MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
 
   // Copy to the old destination registers.
-  MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
+  BuildMI(*MBB, Paired, DL, CopyDesc)
     .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
     .addReg(DestReg, 0, SubRegIdx0);
-  MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
+  MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
     .addOperand(*Dest1)
     .addReg(DestReg, RegState::Kill, SubRegIdx1);
 
-  LIS->InsertMachineInstrInMaps(*Read2);
-
-  // repairLiveintervalsInRange() doesn't handle physical register, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
-  // The new write to the original destination register is now the copy. Steal
-  // the old SlotIndex.
-  LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
-  LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
+  moveInstsAfter(Copy1, InstsToMove);
 
+  MachineBasicBlock::iterator Next = std::next(I);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
-  LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
-  LIS->shrinkToUses(&AddrRegLI);
-
-  LIS->createAndComputeVirtRegInterval(DestReg);
-
-  if (UpdateM0Range) {
-    SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
-    M0Segment->end = Read2Index.getRegSlot();
-  }
-
   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
-  return Read2.getInstr();
+  return Next;
 }
 
 MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   MachineBasicBlock::iterator I,
   MachineBasicBlock::iterator Paired,
-  unsigned EltSize) {
+  unsigned EltSize,
+  ArrayRef<MachineInstr*> InstsToMove) {
   MachineBasicBlock *MBB = I->getParent();
 
   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -341,15 +403,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
   const MCInstrDesc &Write2Desc = TII->get(Opc);
   DebugLoc DL = I->getDebugLoc();
 
-  // repairLiveintervalsInRange() doesn't handle physical register, so we have
-  // to update the M0 range manually.
-  SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
-  LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
-  LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
-  bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
-
   MachineInstrBuilder Write2
-    = BuildMI(*MBB, I, DL, Write2Desc)
+    = BuildMI(*MBB, Paired, DL, Write2Desc)
     .addOperand(*Addr) // addr
     .addOperand(*Data0) // data0
     .addOperand(*Data1) // data1
@@ -359,24 +414,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
     .addMemOperand(*I->memoperands_begin())
     .addMemOperand(*Paired->memoperands_begin());
 
-  // XXX - How do we express subregisters here?
-  unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
+  moveInstsAfter(Write2, InstsToMove);
 
-  LIS->RemoveMachineInstrFromMaps(*I);
-  LIS->RemoveMachineInstrFromMaps(*Paired);
+  MachineBasicBlock::iterator Next = std::next(I);
   I->eraseFromParent();
   Paired->eraseFromParent();
 
-  // This doesn't handle physical registers like M0
-  LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
-
-  if (UpdateM0Range) {
-    SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
-    M0Segment->end = Write2Index.getRegSlot();
-  }
-
   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
-  return Write2.getInstr();
+  return Next;
 }
 
 // Scan through looking for adjacent LDS operations with constant offsets from
@@ -394,13 +439,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     }
 
+    SmallVector<MachineInstr*, 8> InstsToMove;
     unsigned Opc = MI.getOpcode();
     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
       unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
       if (Match != E) {
         Modified = true;
-        I = mergeRead2Pair(I, Match, Size);
+        I = mergeRead2Pair(I, Match, Size, InstsToMove);
       } else {
         ++I;
       }
@@ -408,10 +455,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
       continue;
     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
       unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
-      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
+      MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
+                                                             InstsToMove);
       if (Match != E) {
         Modified = true;
-        I = mergeWrite2Pair(I, Match, Size);
+        I = mergeWrite2Pair(I, Match, Size, InstsToMove);
       } else {
         ++I;
       }
@@ -437,8 +485,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
   TRI = &TII->getRegisterInfo();
 
   MRI = &MF.getRegInfo();
-
-  LIS = &getAnalysis<LiveIntervals>();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
 
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index d62782e5ec7..dab3c10d682 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -9,8 +9,8 @@
 ; SI-LABEL: {{^}}offset_order:
 
 ; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
-; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
-; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
+; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
 
 define void @offset_order(float addrspace(1)* %out) {
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 45fcc01b2ad..ae230dac937 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
 }
 
 ; SI-LABEL: @simple_write2_two_val_f32_x2
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
 }
 
 ; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
-; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
+; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
 define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index fb5853b808e..f6cdbb48c75 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -13,8 +13,8 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; CI: v_ceil_f64_e32
 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
-; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
+; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
+; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
 ; SI-DAG: s_not_b64
 ; SI-DAG: s_and_b64
 ; SI-DAG: cmp_gt_i32
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 73a5c54e175..4c559f42d42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -25,7 +25,8 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
 ; SI: v_rsq_clamp_f64_e32
 
 ; TODO: this constant should be folded:
-; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
+; VI-DAG: s_mov_b32 [[NEG1:s[0-9+]]], -1
+; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], [[NEG1]]
 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
index ce311bfc1e6..9b0cbaa7701 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -73,8 +73,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i16:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
 
 
 ; EG: LDS_READ_RET
@@ -287,11 +287,9 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
   ret void
 }
 
-; FIXME: Should have 2 ds_read_b64
 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 
 ; GCN: ds_write2_b64
 ; GCN: ds_write2_b64
@@ -314,9 +312,9 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
+
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
@@ -379,10 +377,18 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
@@ -407,17 +413,31 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
   ret void
 }
 
-; FIXME: Missed read2
 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
-; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
diff --git a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
index 03f0ffecb5d..271e563024c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -56,10 +56,14 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i32:
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}}
-; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
+; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
+; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
 define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
diff --git a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index f6c0e3c6239..a57e4f59532 100644
--- a/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -44,8 +44,7 @@ entry:
 
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
-; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
+; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 
 ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
 
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 0e9618523e3..f6ed41d9dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -156,7 +156,8 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 
 ; FUNC-LABEL: @reorder_local_offsets
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
index b4d7505e0a8..03ae01afbcd 100644
--- a/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -46,8 +46,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
 }
 
 ; GCN-LABEL: {{^}}local_store_v3i64:
-; GCN: ds_write_b64
-; GCN: ds_write_b64
+; GCN: ds_write2_b64
 ; GCN: ds_write_b64
 define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 0435ed4d552..7a72a4c7ba0 100644
--- a/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/llvm/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -42,12 +42,10 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
 }
 
 ; GCN-LABEL: {{^}}test_use_s_v_s:
-; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
-; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
-
 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
-; GCN-NOT: v_mov_b32
 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
+; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
+; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
 
 ; GCN-NOT: v_mov_b32
 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]