-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPU.h | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 32
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/CMakeLists.txt | 1
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 72
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFrameLowering.h | 3
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 17
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.h | 4
-rw-r--r--  llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 18
-rw-r--r--  llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp | 196
-rw-r--r--  llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 19
-rw-r--r--  llvm/test/CodeGen/AMDGPU/kernel-args.ll | 14
-rw-r--r--  llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll | 42
-rw-r--r--  llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll | 47
-rw-r--r--  llvm/test/CodeGen/AMDGPU/large-alloca.ll | 18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll | 10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll | 583
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll | 22
19 files changed, 827 insertions(+), 281 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 80766086e15..a620e85101e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -48,7 +48,6 @@ FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIFixSGPRLiveRangesPass();
FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
FunctionPass *createSIInsertWaits(TargetMachine &tm);
-FunctionPass *createSIPrepareScratchRegs();
ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 04a0c1d06af..85a06882ffe 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1064,34 +1064,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratch(SDValue Addr, SDValue &Rsrc,
MachineFunction &MF = CurDAG->getMachineFunction();
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
-
- unsigned ScratchOffsetReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
- Lowering.CreateLiveInRegister(*CurDAG, &AMDGPU::SReg_32RegClass,
- ScratchOffsetReg, MVT::i32);
- SDValue Sym0 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD0", MVT::i32);
- SDValue ScratchRsrcDword0 =
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym0), 0);
-
- SDValue Sym1 = CurDAG->getExternalSymbol("SCRATCH_RSRC_DWORD1", MVT::i32);
- SDValue ScratchRsrcDword1 =
- SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, Sym1), 0);
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SDValue RsrcOps[] = {
- CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32),
- ScratchRsrcDword0,
- CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
- ScratchRsrcDword1,
- CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
- };
- SDValue ScratchPtr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::v2i32, RsrcOps), 0);
- Rsrc = SDValue(Lowering.buildScratchRSRC(*CurDAG, DL, ScratchPtr), 0);
- SOffset = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), DL,
- MRI.getLiveInVirtReg(ScratchOffsetReg), MVT::i32);
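+ // The scratch resource descriptor is now materialized once in the prologue
+ // (SIFrameLowering::emitPrologue) and lives in a reserved SReg_128, so
+ // selection can reference the physical registers directly.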
+ unsigned ScratchOffsetReg
+ = TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+ Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
+ SOffset = CurDAG->getRegister(ScratchOffsetReg, MVT::i32);
// (add n0, c1)
if (CurDAG->isBaseWithConstantOffset(Addr)) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 4e31c7ab4d4..7b0445db4df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -327,7 +327,6 @@ void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
}
void GCNPassConfig::addPostRegAlloc() {
- addPass(createSIPrepareScratchRegs(), false);
addPass(createSIShrinkInstructionsPass(), false);
}
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7a4b5bb6d35..64c9e1882e4 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -57,7 +57,6 @@ add_llvm_target(AMDGPUCodeGen
SILowerControlFlow.cpp
SILowerI1Copies.cpp
SIMachineFunctionInfo.cpp
- SIPrepareScratchRegs.cpp
SIRegisterInfo.cpp
SIShrinkInstructions.cpp
SITypeRewriter.cpp
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index a2d8fa1b0a1..6aff4b5700d 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -8,17 +8,89 @@
//===----------------------------------------------------------------------===//
#include "SIFrameLowering.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;
+
+static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
+ const MachineFrameInfo *FrameInfo) {
+ if (!FuncInfo->hasSpilledSGPRs())
+ return false;
+
+ if (FuncInfo->hasSpilledVGPRs())
+ return false;
+
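+ // If every remaining frame object is a spill slot, the only stack users are
+ // SGPR spills, which are lowered to VGPR lanes rather than scratch memory.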
+ for (int I = FrameInfo->getObjectIndexBegin(),
+ E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
+ if (!FrameInfo->isSpillSlotObjectIndex(I))
+ return false;
+ }
+
+ return true;
+}
+
+void SIFrameLowering::emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const {
+ if (!MF.getFrameInfo()->hasStackObjects())
+ return;
+
+ assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If we only have SGPR spills, we won't actually be using scratch memory
+ // since these spill to VGPRs.
+ //
+ // FIXME: We should be cleaning up these unused SGPR spill frame indices
+ // somewhere.
+ if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
+ return;
+
+ const SIInstrInfo *TII =
+ static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+
+ // We need to insert initialization of the scratch resource descriptor.
+ unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
+ assert(ScratchRsrcReg != AMDGPU::NoRegister);
+
+ uint64_t Rsrc23 = TII->getScratchRsrcWords23();
+ MachineBasicBlock::iterator I = MBB.begin();
+ DebugLoc DL;
+
+ unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+
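+ // Dwords 0-1 of the descriptor hold the scratch base address, which is not
+ // known at compile time, so they are filled in through the
+ // SCRATCH_RSRC_DWORD0/1 external symbols.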
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD0");
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
+ .addExternalSymbol("SCRATCH_RSRC_DWORD1");
+
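+ // Dwords 2-3 are target constants packed into Rsrc23; split the 64-bit
+ // value across the two 32-bit moves below.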
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
+ .addImm(Rsrc23 & 0xffffffff);
+
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
+ .addImm(Rsrc23 >> 32);
+}
+
void SIFrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS) const {
MachineFrameInfo *MFI = MF.getFrameInfo();
+
+ if (!MFI->hasStackObjects())
+ return;
+
bool MayNeedScavengingEmergencySlot = MFI->hasStackObjects();
assert((RS || !MayNeedScavengingEmergencySlot) &&
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 677128d6ce0..a9152fd8b2a 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -21,6 +21,9 @@ public:
AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override {}
+ void emitPrologue(MachineFunction &MF,
+ MachineBasicBlock &MBB) const override;
+
void processFunctionBeforeFrameFinalized(
MachineFunction &MF,
RegScavenger *RS = nullptr) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 94fad32c007..51cbc95bc07 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -552,6 +552,7 @@ SDValue SITargetLowering::LowerFormalArguments(
MachineFunction &MF = DAG.getMachineFunction();
FunctionType *FType = MF.getFunction()->getFunctionType();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) {
const Function *Fn = MF.getFunction();
@@ -622,9 +623,9 @@ SDValue SITargetLowering::LowerFormalArguments(
// The pointer to the scratch buffer is stored in SGPR2, SGPR3
if (Info->getShaderType() == ShaderType::COMPUTE) {
if (Subtarget->isAmdHsaOS())
- Info->NumUserSGPRs = 2; // FIXME: Need to support scratch buffers.
+ Info->NumUserSGPRs += 4; // FIXME: Need to support scratch buffers.
else
- Info->NumUserSGPRs = 4;
+ Info->NumUserSGPRs += 4;
unsigned InputPtrReg =
TRI->getPreloadedValue(MF, SIRegisterInfo::INPUT_PTR);
@@ -750,6 +751,9 @@ SDValue SITargetLowering::LowerFormalArguments(
Info->ScratchOffsetReg = AMDGPU::SGPR_32RegClass.getRegister(ScratchIdx);
}
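+ // Pick the SReg_128 that will hold the scratch resource descriptor whenever
+ // this function might actually use scratch memory.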
+ if (MF.getFrameInfo()->hasStackObjects() || ST.isVGPRSpillingEnabled(Info))
+ Info->setScratchRSrcReg(TRI);
+
if (Chains.empty())
return Chain;
@@ -2335,15 +2339,6 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
-MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const {
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
-
- return buildRSRC(DAG, DL, Ptr, 0, TII->getScratchRsrcWords23());
-}
-
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
unsigned Reg, EVT VT) const {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a358e3fc3c0..0659dd7d5d0 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -116,10 +116,6 @@ public:
SDValue Ptr,
uint32_t RsrcDword1,
uint64_t RsrcDword2And3) const;
- MachineSDNode *buildScratchRSRC(SelectionDAG &DAG,
- SDLoc DL,
- SDValue Ptr) const;
-
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 9a85a1d515f..b7d2a471275 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -551,15 +551,16 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
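+ // Reference the reserved scratch rsrc and the preloaded scratch wave offset
+ // directly; these replace the placeholders SIPrepareScratchRegs used to patch.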
+ unsigned ScratchOffsetPreloadReg
+ = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
.addReg(SrcReg) // src
.addFrameIndex(FrameIndex) // frame_idx
- // Place-holder registers, these will be filled in by
- // SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef)
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(ScratchOffsetPreloadReg) // scratch_offset
.addMemOperand(MMO);
}
@@ -637,13 +638,14 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
+ unsigned ScratchOffsetPreloadReg
+ = RI.getPreloadedValue(*MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+
unsigned Opcode = getVGPRSpillRestoreOpcode(RC->getSize());
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // frame_idx
- // Place-holder registers, these will be filled in by
- // SIPrepareScratchRegs.
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef)
+ .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+ .addReg(ScratchOffsetPreloadReg) // scratch_offset
.addMemOperand(MMO);
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 6269dce553f..d042844aa13 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -68,6 +68,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
WorkItemIDZ = true;
}
+void SIMachineFunctionInfo::setScratchRSrcReg(const SIRegisterInfo *TRI) {
+ // We need to round up to next multiple of 4.
+ unsigned NextSReg128 = RoundUpToAlignment(NumUserSGPRs + 5, 4);
+ unsigned RegSub0 = AMDGPU::SReg_32RegClass.getRegister(NextSReg128);
+ ScratchRSrcReg = TRI->getMatchingSuperReg(RegSub0, AMDGPU::sub0,
+ &AMDGPU::SReg_128RegClass);
+}
+
SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
MachineFunction *MF,
unsigned FrameIndex,
diff --git a/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp b/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
deleted file mode 100644
index a6c22775e09..00000000000
--- a/llvm/lib/Target/AMDGPU/SIPrepareScratchRegs.cpp
+++ /dev/null
@@ -1,196 +0,0 @@
-//===-- SIPrepareScratchRegs.cpp - Use predicates for control flow --------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This pass loads scratch pointer and scratch offset into a register or a
-/// frame index which can be used anywhere in the program. These values will
-/// be used for spilling VGPRs.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "SIDefines.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-
-using namespace llvm;
-
-namespace {
-
-class SIPrepareScratchRegs : public MachineFunctionPass {
-
-private:
- static char ID;
-
-public:
- SIPrepareScratchRegs() : MachineFunctionPass(ID) { }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
- const char *getPassName() const override {
- return "SI prepare scratch registers";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-};
-
-} // End anonymous namespace
-
-char SIPrepareScratchRegs::ID = 0;
-
-FunctionPass *llvm::createSIPrepareScratchRegs() {
- return new SIPrepareScratchRegs();
-}
-
-bool SIPrepareScratchRegs::runOnMachineFunction(MachineFunction &MF) {
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- MachineFrameInfo *FrameInfo = MF.getFrameInfo();
- MachineBasicBlock *Entry = &MF.front();
- MachineBasicBlock::iterator I = Entry->begin();
- DebugLoc DL = I->getDebugLoc();
-
- // FIXME: If we don't have enough VGPRs for SGPR spilling we will need to
- // run this pass.
- if (!MFI->hasSpilledVGPRs())
- return false;
-
- unsigned ScratchPtrPreloadReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_PTR);
- unsigned ScratchOffsetPreloadReg =
- TRI->getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
-
- if (!Entry->isLiveIn(ScratchPtrPreloadReg))
- Entry->addLiveIn(ScratchPtrPreloadReg);
-
- if (!Entry->isLiveIn(ScratchOffsetPreloadReg))
- Entry->addLiveIn(ScratchOffsetPreloadReg);
-
- // Load the scratch offset.
- unsigned ScratchOffsetReg =
- TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_32RegClass);
- int ScratchOffsetFI = -1;
-
- if (ScratchOffsetReg != AMDGPU::NoRegister) {
- // Found an SGPR to use
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::S_MOV_B32), ScratchOffsetReg)
- .addReg(ScratchOffsetPreloadReg);
- } else {
- // No SGPR is available, we must spill.
- ScratchOffsetFI = FrameInfo->CreateSpillStackObject(4,4);
- BuildMI(*Entry, I, DL, TII->get(AMDGPU::SI_SPILL_S32_SAVE))
- .addReg(ScratchOffsetPreloadReg)
- .addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- }
-
-
- // Now that we have the scratch pointer and offset values, we need to
- // add them to all the SI_SPILL_V* instructions.
-
- RegScavenger RS;
- unsigned ScratchRsrcFI = FrameInfo->CreateSpillStackObject(16, 4);
- RS.addScavengingFrameIndex(ScratchRsrcFI);
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
-
- MachineBasicBlock &MBB = *BI;
- // Add the scratch offset reg as a live-in so that the register scavenger
- // doesn't re-use it.
- if (!MBB.isLiveIn(ScratchOffsetReg) &&
- ScratchOffsetReg != AMDGPU::NoRegister)
- MBB.addLiveIn(ScratchOffsetReg);
- RS.enterBasicBlock(&MBB);
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- I != E; ++I) {
- MachineInstr &MI = *I;
- RS.forward(I);
- DebugLoc DL = MI.getDebugLoc();
- if (!TII->isVGPRSpill(MI))
- continue;
-
- // Scratch resource
- unsigned ScratchRsrcReg =
- RS.scavengeRegister(&AMDGPU::SReg_128RegClass, 0);
-
- uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
- unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc0)
- .addExternalSymbol("SCRATCH_RSRC_DWORD0")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc1)
- .addExternalSymbol("SCRATCH_RSRC_DWORD1")
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc2)
- .addImm(Rsrc23 & 0xffffffff)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), Rsrc3)
- .addImm(Rsrc23 >> 32)
- .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
- // Scratch Offset
- if (ScratchOffsetReg == AMDGPU::NoRegister) {
- ScratchOffsetReg = RS.scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_SPILL_S32_RESTORE),
- ScratchOffsetReg)
- .addFrameIndex(ScratchOffsetFI)
- .addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Undef)
- .addReg(AMDGPU::SGPR0, RegState::Undef);
- } else if (!MBB.isLiveIn(ScratchOffsetReg)) {
- MBB.addLiveIn(ScratchOffsetReg);
- }
-
- if (ScratchRsrcReg == AMDGPU::NoRegister ||
- ScratchOffsetReg == AMDGPU::NoRegister) {
- LLVMContext &Ctx = MF.getFunction()->getContext();
- Ctx.emitError("ran out of SGPRs for spilling VGPRs");
- ScratchRsrcReg = AMDGPU::SGPR0;
- ScratchOffsetReg = AMDGPU::SGPR0;
- }
- MI.getOperand(2).setReg(ScratchRsrcReg);
- MI.getOperand(2).setIsKill(true);
- MI.getOperand(2).setIsUndef(false);
- MI.getOperand(3).setReg(ScratchOffsetReg);
- MI.getOperand(3).setIsUndef(false);
- MI.getOperand(3).setIsKill(false);
- MI.addOperand(MachineOperand::CreateReg(Rsrc0, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc1, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc2, false, true, true));
- MI.addOperand(MachineOperand::CreateReg(Rsrc3, false, true, true));
- }
- }
- return true;
-}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index ab7539b6fb3..b392c86fa2e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -68,6 +68,22 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
}
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
+ if (ScratchRSrcReg != AMDGPU::NoRegister) {
+ unsigned ScratchOffsetPreloadReg
+ = getPreloadedValue(MF, SIRegisterInfo::SCRATCH_WAVE_OFFSET);
+ // We will need to use this user SGPR argument for spilling, and thus never
+ // want it to be spilled.
+ reserveRegisterTuples(Reserved, ScratchOffsetPreloadReg);
+
+ // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
+ // to spill.
+ // TODO: May need to reserve a VGPR if doing LDS spilling.
+ reserveRegisterTuples(Reserved, ScratchRSrcReg);
+ assert(!isSubRegister(ScratchRSrcReg, ScratchOffsetPreloadReg));
+ }
+
return Reserved;
}
@@ -243,6 +259,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(SubReg)
.addImm(Spill.Lane);
+ // FIXME: Since this spills to another register instead of an actual
+ // frame index, we should delete the frame index when all references to
+ // it are fixed.
}
MI->eraseFromParent();
break;
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
index 803b2ecced0..e9d98ac89e7 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll
@@ -294,8 +294,8 @@ entry:
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
-; VI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x44
+; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
+; VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
entry:
store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
@@ -311,7 +311,7 @@ entry:
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
-; SI: s_load_dwordx8 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x11
+; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
entry:
store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
@@ -413,8 +413,8 @@ entry:
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
entry:
store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
@@ -438,8 +438,8 @@ entry:
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[9].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[10].X
-; SI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x19
-; VI: s_load_dwordx16 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x64
+; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
+; VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
entry:
store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
new file mode 100644
index 00000000000..5e8cf5bb3d2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -0,0 +1,42 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; XUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
+
+; FIXME: align on alloca seems to be ignored for private_segment_alignment
+
+; ALL-LABEL: {{^}}large_alloca_compute_shader:
+
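+; The prologue materializes the scratch resource descriptor into the first
+; free 4-aligned SGPR quad, s[12:15] here: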
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s14, -1
+; CI: s_mov_b32 s15, 0x80f000
+; VI: s_mov_b32 s15, 0x800000
+
+
+; GCNHSA: .amd_kernel_code_t
+; GCNHSA: private_segment_alignment = 4
+; GCNHSA: .end_amd_kernel_code_t
+
+; GCNHSA: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCNHSA: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCNHSA: s_mov_b32 s10, -1
+; CIHSA: s_mov_b32 s11, 0x180f000
+; VIHSA: s_mov_b32 s11, 0x11800000
+
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s6 offen
+
+; Scratch size = alloca size + emergency stack slot
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+ %large = alloca [8192 x i32], align 4
+ %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+ store volatile i32 %x, i32* %gep
+ %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+ %val = load volatile i32, i32* %gep1
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
new file mode 100644
index 00000000000..208b9a10050
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+
+; ALL-LABEL: {{^}}large_alloca_pixel_shader:
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
+
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
+ %large = alloca [8192 x i32], align 4
+ %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+ store volatile i32 %x, i32* %gep
+ %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+ %val = load volatile i32, i32* %gep1
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg:
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN: s_mov_b32 s10, -1
+; CI: s_mov_b32 s11, 0x80f000
+; VI: s_mov_b32 s11, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
+
+; ALL: ; ScratchSize: 32772
+define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 {
+ %large = alloca [8192 x i32], align 4
+ %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
+ store volatile i32 %x, i32* %gep
+ %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
+ %val = load volatile i32, i32* %gep1
+ store volatile i32 %val, i32 addrspace(1)* undef
+ ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "ShaderType"="0" }
diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca.ll b/llvm/test/CodeGen/AMDGPU/large-alloca.ll
deleted file mode 100644
index e1122da78ef..00000000000
--- a/llvm/test/CodeGen/AMDGPU/large-alloca.ll
+++ /dev/null
@@ -1,18 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-; GCN-LABEL: {{^}}large_alloca:
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-; GCN: ScratchSize: 32776
-define void @large_alloca(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
- %large = alloca [8192 x i32], align 4
- %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
- store i32 %x, i32* %gep
- %gep1 = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 %y
- %load = load i32, i32* %gep1
- store i32 %load, i32 addrspace(1)* %out
- ret void
-}
-
-attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index 84652701f77..d7b35fc631e 100644
--- a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -6,6 +6,16 @@
; CHECK-LABEL: {{^}}main:
; CHECK: s_wqm
+
+; Make sure we aren't emitting the unused scratch resource descriptor setup.
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+; CHECK-NOT: s_mov_b32
+
+; CHECK: s_mov_b32 m0
+
+
; Writing to M0 from an SMRD instruction will hang the GPU.
; CHECK-NOT: s_buffer_load_dword m0
; CHECK: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
new file mode 100644
index 00000000000..2cbb67d85fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -0,0 +1,583 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
+; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+
+; This ends up using all 256 registers and requires register
+; scavenging, which will fail to find an unused register.
+
+; Check the ScratchSize to avoid regressions from spilling
+; intermediate register class copies.
+
+; FIXME: The same register is initialized to 0 for every spill.
+
+declare i32 @llvm.r600.read.tgid.x() #1
+declare i32 @llvm.r600.read.tgid.y() #1
+declare i32 @llvm.r600.read.tgid.z() #1
+
+; GCN-LABEL: {{^}}spill_vgpr_compute:
+
+; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s14, -1
+; SI-NEXT: s_mov_b32 s15, 0x80f000
+; VI-NEXT: s_mov_b32 s15, 0x800000
+
+; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s8 offset:{{[0-9]+}} ; 4-byte Folded Spill
+
+; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[12:15], s8 offen offset:{{[0-9]+}}
+
+; GCN: NumVgprs: 256
+; GCN: ScratchSize: 1024
+
+; s[0:3] = input user SGPRs. s4, s5, s6 = workgroup IDs. s8 = scratch wave offset.
+define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+bb:
+ %tmp = add i32 %arg1, %arg2
+ %tmp7 = extractelement <4 x float> %arg6, i32 0
+ %tmp8 = extractelement <4 x float> %arg6, i32 1
+ %tmp9 = extractelement <4 x float> %arg6, i32 2
+ %tmp10 = extractelement <4 x float> %arg6, i32 3
+ %tmp11 = bitcast float %arg5 to i32
+ br label %bb12
+
+bb12: ; preds = %bb145, %bb
+ %tmp13 = phi float [ 0.000000e+00, %bb ], [ %tmp338, %bb145 ]
+ %tmp14 = phi float [ 0.000000e+00, %bb ], [ %tmp337, %bb145 ]
+ %tmp15 = phi float [ 0.000000e+00, %bb ], [ %tmp336, %bb145 ]
+ %tmp16 = phi float [ 0.000000e+00, %bb ], [ %tmp339, %bb145 ]
+ %tmp17 = phi float [ 0.000000e+00, %bb ], [ %tmp335, %bb145 ]
+ %tmp18 = phi float [ 0.000000e+00, %bb ], [ %tmp334, %bb145 ]
+ %tmp19 = phi float [ 0.000000e+00, %bb ], [ %tmp333, %bb145 ]
+ %tmp20 = phi float [ 0.000000e+00, %bb ], [ %tmp340, %bb145 ]
+ %tmp21 = phi float [ 0.000000e+00, %bb ], [ %tmp332, %bb145 ]
+ %tmp22 = phi float [ 0.000000e+00, %bb ], [ %tmp331, %bb145 ]
+ %tmp23 = phi float [ 0.000000e+00, %bb ], [ %tmp330, %bb145 ]
+ %tmp24 = phi float [ 0.000000e+00, %bb ], [ %tmp341, %bb145 ]
+ %tmp25 = phi float [ 0.000000e+00, %bb ], [ %tmp329, %bb145 ]
+ %tmp26 = phi float [ 0.000000e+00, %bb ], [ %tmp328, %bb145 ]
+ %tmp27 = phi float [ 0.000000e+00, %bb ], [ %tmp327, %bb145 ]
+ %tmp28 = phi float [ 0.000000e+00, %bb ], [ %tmp342, %bb145 ]
+ %tmp29 = phi float [ 0.000000e+00, %bb ], [ %tmp326, %bb145 ]
+ %tmp30 = phi float [ 0.000000e+00, %bb ], [ %tmp325, %bb145 ]
+ %tmp31 = phi float [ 0.000000e+00, %bb ], [ %tmp324, %bb145 ]
+ %tmp32 = phi float [ 0.000000e+00, %bb ], [ %tmp343, %bb145 ]
+ %tmp33 = phi float [ 0.000000e+00, %bb ], [ %tmp323, %bb145 ]
+ %tmp34 = phi float [ 0.000000e+00, %bb ], [ %tmp322, %bb145 ]
+ %tmp35 = phi float [ 0.000000e+00, %bb ], [ %tmp321, %bb145 ]
+ %tmp36 = phi float [ 0.000000e+00, %bb ], [ %tmp344, %bb145 ]
+ %tmp37 = phi float [ 0.000000e+00, %bb ], [ %tmp320, %bb145 ]
+ %tmp38 = phi float [ 0.000000e+00, %bb ], [ %tmp319, %bb145 ]
+ %tmp39 = phi float [ 0.000000e+00, %bb ], [ %tmp318, %bb145 ]
+ %tmp40 = phi float [ 0.000000e+00, %bb ], [ %tmp345, %bb145 ]
+ %tmp41 = phi float [ 0.000000e+00, %bb ], [ %tmp317, %bb145 ]
+ %tmp42 = phi float [ 0.000000e+00, %bb ], [ %tmp316, %bb145 ]
+ %tmp43 = phi float [ 0.000000e+00, %bb ], [ %tmp315, %bb145 ]
+ %tmp44 = phi float [ 0.000000e+00, %bb ], [ %tmp346, %bb145 ]
+ %tmp45 = phi float [ 0.000000e+00, %bb ], [ %tmp314, %bb145 ]
+ %tmp46 = phi float [ 0.000000e+00, %bb ], [ %tmp313, %bb145 ]
+ %tmp47 = phi float [ 0.000000e+00, %bb ], [ %tmp312, %bb145 ]
+ %tmp48 = phi float [ 0.000000e+00, %bb ], [ %tmp347, %bb145 ]
+ %tmp49 = phi float [ 0.000000e+00, %bb ], [ %tmp311, %bb145 ]
+ %tmp50 = phi float [ 0.000000e+00, %bb ], [ %tmp310, %bb145 ]
+ %tmp51 = phi float [ 0.000000e+00, %bb ], [ %tmp309, %bb145 ]
+ %tmp52 = phi float [ 0.000000e+00, %bb ], [ %tmp348, %bb145 ]
+ %tmp53 = phi float [ 0.000000e+00, %bb ], [ %tmp308, %bb145 ]
+ %tmp54 = phi float [ 0.000000e+00, %bb ], [ %tmp307, %bb145 ]
+ %tmp55 = phi float [ 0.000000e+00, %bb ], [ %tmp306, %bb145 ]
+ %tmp56 = phi float [ 0.000000e+00, %bb ], [ %tmp349, %bb145 ]
+ %tmp57 = phi float [ 0.000000e+00, %bb ], [ %tmp305, %bb145 ]
+ %tmp58 = phi float [ 0.000000e+00, %bb ], [ %tmp304, %bb145 ]
+ %tmp59 = phi float [ 0.000000e+00, %bb ], [ %tmp303, %bb145 ]
+ %tmp60 = phi float [ 0.000000e+00, %bb ], [ %tmp350, %bb145 ]
+ %tmp61 = phi float [ 0.000000e+00, %bb ], [ %tmp302, %bb145 ]
+ %tmp62 = phi float [ 0.000000e+00, %bb ], [ %tmp301, %bb145 ]
+ %tmp63 = phi float [ 0.000000e+00, %bb ], [ %tmp300, %bb145 ]
+ %tmp64 = phi float [ 0.000000e+00, %bb ], [ %tmp351, %bb145 ]
+ %tmp65 = phi float [ 0.000000e+00, %bb ], [ %tmp299, %bb145 ]
+ %tmp66 = phi float [ 0.000000e+00, %bb ], [ %tmp298, %bb145 ]
+ %tmp67 = phi float [ 0.000000e+00, %bb ], [ %tmp297, %bb145 ]
+ %tmp68 = phi float [ 0.000000e+00, %bb ], [ %tmp352, %bb145 ]
+ %tmp69 = phi float [ 0.000000e+00, %bb ], [ %tmp296, %bb145 ]
+ %tmp70 = phi float [ 0.000000e+00, %bb ], [ %tmp295, %bb145 ]
+ %tmp71 = phi float [ 0.000000e+00, %bb ], [ %tmp294, %bb145 ]
+ %tmp72 = phi float [ 0.000000e+00, %bb ], [ %tmp353, %bb145 ]
+ %tmp73 = phi float [ 0.000000e+00, %bb ], [ %tmp293, %bb145 ]
+ %tmp74 = phi float [ 0.000000e+00, %bb ], [ %tmp292, %bb145 ]
+ %tmp75 = phi float [ 0.000000e+00, %bb ], [ %tmp291, %bb145 ]
+ %tmp76 = phi float [ 0.000000e+00, %bb ], [ %tmp354, %bb145 ]
+ %tmp77 = phi float [ 0.000000e+00, %bb ], [ %tmp290, %bb145 ]
+ %tmp78 = phi float [ 0.000000e+00, %bb ], [ %tmp289, %bb145 ]
+ %tmp79 = phi float [ 0.000000e+00, %bb ], [ %tmp288, %bb145 ]
+ %tmp80 = phi float [ 0.000000e+00, %bb ], [ %tmp355, %bb145 ]
+ %tmp81 = phi float [ 0.000000e+00, %bb ], [ %tmp287, %bb145 ]
+ %tmp82 = phi float [ 0.000000e+00, %bb ], [ %tmp286, %bb145 ]
+ %tmp83 = phi float [ 0.000000e+00, %bb ], [ %tmp285, %bb145 ]
+ %tmp84 = phi float [ 0.000000e+00, %bb ], [ %tmp356, %bb145 ]
+ %tmp85 = phi float [ 0.000000e+00, %bb ], [ %tmp284, %bb145 ]
+ %tmp86 = phi float [ 0.000000e+00, %bb ], [ %tmp283, %bb145 ]
+ %tmp87 = phi float [ 0.000000e+00, %bb ], [ %tmp282, %bb145 ]
+ %tmp88 = phi float [ 0.000000e+00, %bb ], [ %tmp357, %bb145 ]
+ %tmp89 = phi float [ 0.000000e+00, %bb ], [ %tmp281, %bb145 ]
+ %tmp90 = phi float [ 0.000000e+00, %bb ], [ %tmp280, %bb145 ]
+ %tmp91 = phi float [ 0.000000e+00, %bb ], [ %tmp279, %bb145 ]
+ %tmp92 = phi float [ 0.000000e+00, %bb ], [ %tmp358, %bb145 ]
+ %tmp93 = phi float [ 0.000000e+00, %bb ], [ %tmp359, %bb145 ]
+ %tmp94 = phi float [ 0.000000e+00, %bb ], [ %tmp360, %bb145 ]
+ %tmp95 = phi float [ 0.000000e+00, %bb ], [ %tmp409, %bb145 ]
+ %tmp96 = phi float [ 0.000000e+00, %bb ], [ %tmp361, %bb145 ]
+ %tmp97 = phi float [ 0.000000e+00, %bb ], [ %tmp362, %bb145 ]
+ %tmp98 = phi float [ 0.000000e+00, %bb ], [ %tmp363, %bb145 ]
+ %tmp99 = phi float [ 0.000000e+00, %bb ], [ %tmp364, %bb145 ]
+ %tmp100 = phi float [ 0.000000e+00, %bb ], [ %tmp365, %bb145 ]
+ %tmp101 = phi float [ 0.000000e+00, %bb ], [ %tmp366, %bb145 ]
+ %tmp102 = phi float [ 0.000000e+00, %bb ], [ %tmp367, %bb145 ]
+ %tmp103 = phi float [ 0.000000e+00, %bb ], [ %tmp368, %bb145 ]
+ %tmp104 = phi float [ 0.000000e+00, %bb ], [ %tmp369, %bb145 ]
+ %tmp105 = phi float [ 0.000000e+00, %bb ], [ %tmp370, %bb145 ]
+ %tmp106 = phi float [ 0.000000e+00, %bb ], [ %tmp371, %bb145 ]
+ %tmp107 = phi float [ 0.000000e+00, %bb ], [ %tmp372, %bb145 ]
+ %tmp108 = phi float [ 0.000000e+00, %bb ], [ %tmp373, %bb145 ]
+ %tmp109 = phi float [ 0.000000e+00, %bb ], [ %tmp374, %bb145 ]
+ %tmp110 = phi float [ 0.000000e+00, %bb ], [ %tmp375, %bb145 ]
+ %tmp111 = phi float [ 0.000000e+00, %bb ], [ %tmp376, %bb145 ]
+ %tmp112 = phi float [ 0.000000e+00, %bb ], [ %tmp377, %bb145 ]
+ %tmp113 = phi float [ 0.000000e+00, %bb ], [ %tmp378, %bb145 ]
+ %tmp114 = phi float [ 0.000000e+00, %bb ], [ %tmp379, %bb145 ]
+ %tmp115 = phi float [ 0.000000e+00, %bb ], [ %tmp380, %bb145 ]
+ %tmp116 = phi float [ 0.000000e+00, %bb ], [ %tmp381, %bb145 ]
+ %tmp117 = phi float [ 0.000000e+00, %bb ], [ %tmp382, %bb145 ]
+ %tmp118 = phi float [ 0.000000e+00, %bb ], [ %tmp383, %bb145 ]
+ %tmp119 = phi float [ 0.000000e+00, %bb ], [ %tmp384, %bb145 ]
+ %tmp120 = phi float [ 0.000000e+00, %bb ], [ %tmp385, %bb145 ]
+ %tmp121 = phi float [ 0.000000e+00, %bb ], [ %tmp386, %bb145 ]
+ %tmp122 = phi float [ 0.000000e+00, %bb ], [ %tmp387, %bb145 ]
+ %tmp123 = phi float [ 0.000000e+00, %bb ], [ %tmp388, %bb145 ]
+ %tmp124 = phi float [ 0.000000e+00, %bb ], [ %tmp389, %bb145 ]
+ %tmp125 = phi float [ 0.000000e+00, %bb ], [ %tmp390, %bb145 ]
+ %tmp126 = phi float [ 0.000000e+00, %bb ], [ %tmp391, %bb145 ]
+ %tmp127 = phi float [ 0.000000e+00, %bb ], [ %tmp392, %bb145 ]
+ %tmp128 = phi float [ 0.000000e+00, %bb ], [ %tmp393, %bb145 ]
+ %tmp129 = phi float [ 0.000000e+00, %bb ], [ %tmp394, %bb145 ]
+ %tmp130 = phi float [ 0.000000e+00, %bb ], [ %tmp395, %bb145 ]
+ %tmp131 = phi float [ 0.000000e+00, %bb ], [ %tmp396, %bb145 ]
+ %tmp132 = phi float [ 0.000000e+00, %bb ], [ %tmp397, %bb145 ]
+ %tmp133 = phi float [ 0.000000e+00, %bb ], [ %tmp398, %bb145 ]
+ %tmp134 = phi float [ 0.000000e+00, %bb ], [ %tmp399, %bb145 ]
+ %tmp135 = phi float [ 0.000000e+00, %bb ], [ %tmp400, %bb145 ]
+ %tmp136 = phi float [ 0.000000e+00, %bb ], [ %tmp401, %bb145 ]
+ %tmp137 = phi float [ 0.000000e+00, %bb ], [ %tmp402, %bb145 ]
+ %tmp138 = phi float [ 0.000000e+00, %bb ], [ %tmp403, %bb145 ]
+ %tmp139 = phi float [ 0.000000e+00, %bb ], [ %tmp404, %bb145 ]
+ %tmp140 = phi float [ 0.000000e+00, %bb ], [ %tmp405, %bb145 ]
+ %tmp141 = phi float [ 0.000000e+00, %bb ], [ %tmp406, %bb145 ]
+ %tmp142 = bitcast float %tmp95 to i32
+ %tmp143 = icmp sgt i32 %tmp142, 125
+ br i1 %tmp143, label %bb144, label %bb145
+
+bb144: ; preds = %bb12
+ store volatile float %arg3, float addrspace(1)* %arg
+ store volatile float %tmp91, float addrspace(1)* %arg
+ store volatile float %tmp90, float addrspace(1)* %arg
+ store volatile float %tmp89, float addrspace(1)* %arg
+ store volatile float %tmp87, float addrspace(1)* %arg
+ store volatile float %tmp86, float addrspace(1)* %arg
+ store volatile float %tmp85, float addrspace(1)* %arg
+ store volatile float %tmp83, float addrspace(1)* %arg
+ store volatile float %tmp82, float addrspace(1)* %arg
+ store volatile float %tmp81, float addrspace(1)* %arg
+ store volatile float %tmp79, float addrspace(1)* %arg
+ store volatile float %tmp78, float addrspace(1)* %arg
+ store volatile float %tmp77, float addrspace(1)* %arg
+ store volatile float %tmp75, float addrspace(1)* %arg
+ store volatile float %tmp74, float addrspace(1)* %arg
+ store volatile float %tmp73, float addrspace(1)* %arg
+ store volatile float %tmp71, float addrspace(1)* %arg
+ store volatile float %tmp70, float addrspace(1)* %arg
+ store volatile float %tmp69, float addrspace(1)* %arg
+ store volatile float %tmp67, float addrspace(1)* %arg
+ store volatile float %tmp66, float addrspace(1)* %arg
+ store volatile float %tmp65, float addrspace(1)* %arg
+ store volatile float %tmp63, float addrspace(1)* %arg
+ store volatile float %tmp62, float addrspace(1)* %arg
+ store volatile float %tmp61, float addrspace(1)* %arg
+ store volatile float %tmp59, float addrspace(1)* %arg
+ store volatile float %tmp58, float addrspace(1)* %arg
+ store volatile float %tmp57, float addrspace(1)* %arg
+ store volatile float %tmp55, float addrspace(1)* %arg
+ store volatile float %tmp54, float addrspace(1)* %arg
+ store volatile float %tmp53, float addrspace(1)* %arg
+ store volatile float %tmp51, float addrspace(1)* %arg
+ store volatile float %tmp50, float addrspace(1)* %arg
+ store volatile float %tmp49, float addrspace(1)* %arg
+ store volatile float %tmp47, float addrspace(1)* %arg
+ store volatile float %tmp46, float addrspace(1)* %arg
+ store volatile float %tmp45, float addrspace(1)* %arg
+ store volatile float %tmp43, float addrspace(1)* %arg
+ store volatile float %tmp42, float addrspace(1)* %arg
+ store volatile float %tmp41, float addrspace(1)* %arg
+ store volatile float %tmp39, float addrspace(1)* %arg
+ store volatile float %tmp38, float addrspace(1)* %arg
+ store volatile float %tmp37, float addrspace(1)* %arg
+ store volatile float %tmp35, float addrspace(1)* %arg
+ store volatile float %tmp34, float addrspace(1)* %arg
+ store volatile float %tmp33, float addrspace(1)* %arg
+ store volatile float %tmp31, float addrspace(1)* %arg
+ store volatile float %tmp30, float addrspace(1)* %arg
+ store volatile float %tmp29, float addrspace(1)* %arg
+ store volatile float %tmp27, float addrspace(1)* %arg
+ store volatile float %tmp26, float addrspace(1)* %arg
+ store volatile float %tmp25, float addrspace(1)* %arg
+ store volatile float %tmp23, float addrspace(1)* %arg
+ store volatile float %tmp22, float addrspace(1)* %arg
+ store volatile float %tmp21, float addrspace(1)* %arg
+ store volatile float %tmp19, float addrspace(1)* %arg
+ store volatile float %tmp18, float addrspace(1)* %arg
+ store volatile float %tmp17, float addrspace(1)* %arg
+ store volatile float %tmp15, float addrspace(1)* %arg
+ store volatile float %tmp14, float addrspace(1)* %arg
+ store volatile float %tmp13, float addrspace(1)* %arg
+ store volatile float %tmp16, float addrspace(1)* %arg
+ store volatile float %tmp20, float addrspace(1)* %arg
+ store volatile float %tmp24, float addrspace(1)* %arg
+ store volatile float %tmp28, float addrspace(1)* %arg
+ store volatile float %tmp32, float addrspace(1)* %arg
+ store volatile float %tmp36, float addrspace(1)* %arg
+ store volatile float %tmp40, float addrspace(1)* %arg
+ store volatile float %tmp44, float addrspace(1)* %arg
+ store volatile float %tmp48, float addrspace(1)* %arg
+ store volatile float %tmp52, float addrspace(1)* %arg
+ store volatile float %tmp56, float addrspace(1)* %arg
+ store volatile float %tmp60, float addrspace(1)* %arg
+ store volatile float %tmp64, float addrspace(1)* %arg
+ store volatile float %tmp68, float addrspace(1)* %arg
+ store volatile float %tmp72, float addrspace(1)* %arg
+ store volatile float %tmp76, float addrspace(1)* %arg
+ store volatile float %tmp80, float addrspace(1)* %arg
+ store volatile float %tmp84, float addrspace(1)* %arg
+ store volatile float %tmp88, float addrspace(1)* %arg
+ store volatile float %tmp92, float addrspace(1)* %arg
+ store volatile float %tmp93, float addrspace(1)* %arg
+ store volatile float %tmp94, float addrspace(1)* %arg
+ store volatile float %tmp96, float addrspace(1)* %arg
+ store volatile float %tmp97, float addrspace(1)* %arg
+ store volatile float %tmp98, float addrspace(1)* %arg
+ store volatile float %tmp99, float addrspace(1)* %arg
+ store volatile float %tmp100, float addrspace(1)* %arg
+ store volatile float %tmp101, float addrspace(1)* %arg
+ store volatile float %tmp102, float addrspace(1)* %arg
+ store volatile float %tmp103, float addrspace(1)* %arg
+ store volatile float %tmp104, float addrspace(1)* %arg
+ store volatile float %tmp105, float addrspace(1)* %arg
+ store volatile float %tmp106, float addrspace(1)* %arg
+ store volatile float %tmp107, float addrspace(1)* %arg
+ store volatile float %tmp108, float addrspace(1)* %arg
+ store volatile float %tmp109, float addrspace(1)* %arg
+ store volatile float %tmp110, float addrspace(1)* %arg
+ store volatile float %tmp111, float addrspace(1)* %arg
+ store volatile float %tmp112, float addrspace(1)* %arg
+ store volatile float %tmp113, float addrspace(1)* %arg
+ store volatile float %tmp114, float addrspace(1)* %arg
+ store volatile float %tmp115, float addrspace(1)* %arg
+ store volatile float %tmp116, float addrspace(1)* %arg
+ store volatile float %tmp117, float addrspace(1)* %arg
+ store volatile float %tmp118, float addrspace(1)* %arg
+ store volatile float %tmp119, float addrspace(1)* %arg
+ store volatile float %tmp120, float addrspace(1)* %arg
+ store volatile float %tmp121, float addrspace(1)* %arg
+ store volatile float %tmp122, float addrspace(1)* %arg
+ store volatile float %tmp123, float addrspace(1)* %arg
+ store volatile float %tmp124, float addrspace(1)* %arg
+ store volatile float %tmp125, float addrspace(1)* %arg
+ store volatile float %tmp126, float addrspace(1)* %arg
+ store volatile float %tmp127, float addrspace(1)* %arg
+ store volatile float %tmp128, float addrspace(1)* %arg
+ store volatile float %tmp129, float addrspace(1)* %arg
+ store volatile float %tmp130, float addrspace(1)* %arg
+ store volatile float %tmp131, float addrspace(1)* %arg
+ store volatile float %tmp132, float addrspace(1)* %arg
+ store volatile float %tmp133, float addrspace(1)* %arg
+ store volatile float %tmp134, float addrspace(1)* %arg
+ store volatile float %tmp135, float addrspace(1)* %arg
+ store volatile float %tmp136, float addrspace(1)* %arg
+ store volatile float %tmp137, float addrspace(1)* %arg
+ store volatile float %tmp138, float addrspace(1)* %arg
+ store volatile float %tmp139, float addrspace(1)* %arg
+ store volatile float %arg4, float addrspace(1)* %arg
+ store volatile float %tmp7, float addrspace(1)* %arg
+ store volatile float %tmp8, float addrspace(1)* %arg
+ store volatile float %tmp9, float addrspace(1)* %arg
+ store volatile float %tmp10, float addrspace(1)* %arg
+ ret void
+
+bb145: ; preds = %bb12
+ %tmp146 = bitcast float %tmp95 to i32
+ %tmp147 = bitcast float %tmp95 to i32
+ %tmp148 = add i32 %tmp11, %tmp147
+ %tmp149 = bitcast i32 %tmp148 to float
+ %tmp150 = insertelement <128 x float> undef, float %tmp91, i32 0
+ %tmp151 = insertelement <128 x float> %tmp150, float %tmp90, i32 1
+ %tmp152 = insertelement <128 x float> %tmp151, float %tmp89, i32 2
+ %tmp153 = insertelement <128 x float> %tmp152, float %tmp87, i32 3
+ %tmp154 = insertelement <128 x float> %tmp153, float %tmp86, i32 4
+ %tmp155 = insertelement <128 x float> %tmp154, float %tmp85, i32 5
+ %tmp156 = insertelement <128 x float> %tmp155, float %tmp83, i32 6
+ %tmp157 = insertelement <128 x float> %tmp156, float %tmp82, i32 7
+ %tmp158 = insertelement <128 x float> %tmp157, float %tmp81, i32 8
+ %tmp159 = insertelement <128 x float> %tmp158, float %tmp79, i32 9
+ %tmp160 = insertelement <128 x float> %tmp159, float %tmp78, i32 10
+ %tmp161 = insertelement <128 x float> %tmp160, float %tmp77, i32 11
+ %tmp162 = insertelement <128 x float> %tmp161, float %tmp75, i32 12
+ %tmp163 = insertelement <128 x float> %tmp162, float %tmp74, i32 13
+ %tmp164 = insertelement <128 x float> %tmp163, float %tmp73, i32 14
+ %tmp165 = insertelement <128 x float> %tmp164, float %tmp71, i32 15
+ %tmp166 = insertelement <128 x float> %tmp165, float %tmp70, i32 16
+ %tmp167 = insertelement <128 x float> %tmp166, float %tmp69, i32 17
+ %tmp168 = insertelement <128 x float> %tmp167, float %tmp67, i32 18
+ %tmp169 = insertelement <128 x float> %tmp168, float %tmp66, i32 19
+ %tmp170 = insertelement <128 x float> %tmp169, float %tmp65, i32 20
+ %tmp171 = insertelement <128 x float> %tmp170, float %tmp63, i32 21
+ %tmp172 = insertelement <128 x float> %tmp171, float %tmp62, i32 22
+ %tmp173 = insertelement <128 x float> %tmp172, float %tmp61, i32 23
+ %tmp174 = insertelement <128 x float> %tmp173, float %tmp59, i32 24
+ %tmp175 = insertelement <128 x float> %tmp174, float %tmp58, i32 25
+ %tmp176 = insertelement <128 x float> %tmp175, float %tmp57, i32 26
+ %tmp177 = insertelement <128 x float> %tmp176, float %tmp55, i32 27
+ %tmp178 = insertelement <128 x float> %tmp177, float %tmp54, i32 28
+ %tmp179 = insertelement <128 x float> %tmp178, float %tmp53, i32 29
+ %tmp180 = insertelement <128 x float> %tmp179, float %tmp51, i32 30
+ %tmp181 = insertelement <128 x float> %tmp180, float %tmp50, i32 31
+ %tmp182 = insertelement <128 x float> %tmp181, float %tmp49, i32 32
+ %tmp183 = insertelement <128 x float> %tmp182, float %tmp47, i32 33
+ %tmp184 = insertelement <128 x float> %tmp183, float %tmp46, i32 34
+ %tmp185 = insertelement <128 x float> %tmp184, float %tmp45, i32 35
+ %tmp186 = insertelement <128 x float> %tmp185, float %tmp43, i32 36
+ %tmp187 = insertelement <128 x float> %tmp186, float %tmp42, i32 37
+ %tmp188 = insertelement <128 x float> %tmp187, float %tmp41, i32 38
+ %tmp189 = insertelement <128 x float> %tmp188, float %tmp39, i32 39
+ %tmp190 = insertelement <128 x float> %tmp189, float %tmp38, i32 40
+ %tmp191 = insertelement <128 x float> %tmp190, float %tmp37, i32 41
+ %tmp192 = insertelement <128 x float> %tmp191, float %tmp35, i32 42
+ %tmp193 = insertelement <128 x float> %tmp192, float %tmp34, i32 43
+ %tmp194 = insertelement <128 x float> %tmp193, float %tmp33, i32 44
+ %tmp195 = insertelement <128 x float> %tmp194, float %tmp31, i32 45
+ %tmp196 = insertelement <128 x float> %tmp195, float %tmp30, i32 46
+ %tmp197 = insertelement <128 x float> %tmp196, float %tmp29, i32 47
+ %tmp198 = insertelement <128 x float> %tmp197, float %tmp27, i32 48
+ %tmp199 = insertelement <128 x float> %tmp198, float %tmp26, i32 49
+ %tmp200 = insertelement <128 x float> %tmp199, float %tmp25, i32 50
+ %tmp201 = insertelement <128 x float> %tmp200, float %tmp23, i32 51
+ %tmp202 = insertelement <128 x float> %tmp201, float %tmp22, i32 52
+ %tmp203 = insertelement <128 x float> %tmp202, float %tmp21, i32 53
+ %tmp204 = insertelement <128 x float> %tmp203, float %tmp19, i32 54
+ %tmp205 = insertelement <128 x float> %tmp204, float %tmp18, i32 55
+ %tmp206 = insertelement <128 x float> %tmp205, float %tmp17, i32 56
+ %tmp207 = insertelement <128 x float> %tmp206, float %tmp15, i32 57
+ %tmp208 = insertelement <128 x float> %tmp207, float %tmp14, i32 58
+ %tmp209 = insertelement <128 x float> %tmp208, float %tmp13, i32 59
+ %tmp210 = insertelement <128 x float> %tmp209, float %tmp16, i32 60
+ %tmp211 = insertelement <128 x float> %tmp210, float %tmp20, i32 61
+ %tmp212 = insertelement <128 x float> %tmp211, float %tmp24, i32 62
+ %tmp213 = insertelement <128 x float> %tmp212, float %tmp28, i32 63
+ %tmp214 = insertelement <128 x float> %tmp213, float %tmp32, i32 64
+ %tmp215 = insertelement <128 x float> %tmp214, float %tmp36, i32 65
+ %tmp216 = insertelement <128 x float> %tmp215, float %tmp40, i32 66
+ %tmp217 = insertelement <128 x float> %tmp216, float %tmp44, i32 67
+ %tmp218 = insertelement <128 x float> %tmp217, float %tmp48, i32 68
+ %tmp219 = insertelement <128 x float> %tmp218, float %tmp52, i32 69
+ %tmp220 = insertelement <128 x float> %tmp219, float %tmp56, i32 70
+ %tmp221 = insertelement <128 x float> %tmp220, float %tmp60, i32 71
+ %tmp222 = insertelement <128 x float> %tmp221, float %tmp64, i32 72
+ %tmp223 = insertelement <128 x float> %tmp222, float %tmp68, i32 73
+ %tmp224 = insertelement <128 x float> %tmp223, float %tmp72, i32 74
+ %tmp225 = insertelement <128 x float> %tmp224, float %tmp76, i32 75
+ %tmp226 = insertelement <128 x float> %tmp225, float %tmp80, i32 76
+ %tmp227 = insertelement <128 x float> %tmp226, float %tmp84, i32 77
+ %tmp228 = insertelement <128 x float> %tmp227, float %tmp88, i32 78
+ %tmp229 = insertelement <128 x float> %tmp228, float %tmp92, i32 79
+ %tmp230 = insertelement <128 x float> %tmp229, float %tmp93, i32 80
+ %tmp231 = insertelement <128 x float> %tmp230, float %tmp94, i32 81
+ %tmp232 = insertelement <128 x float> %tmp231, float %tmp96, i32 82
+ %tmp233 = insertelement <128 x float> %tmp232, float %tmp97, i32 83
+ %tmp234 = insertelement <128 x float> %tmp233, float %tmp98, i32 84
+ %tmp235 = insertelement <128 x float> %tmp234, float %tmp99, i32 85
+ %tmp236 = insertelement <128 x float> %tmp235, float %tmp100, i32 86
+ %tmp237 = insertelement <128 x float> %tmp236, float %tmp101, i32 87
+ %tmp238 = insertelement <128 x float> %tmp237, float %tmp102, i32 88
+ %tmp239 = insertelement <128 x float> %tmp238, float %tmp103, i32 89
+ %tmp240 = insertelement <128 x float> %tmp239, float %tmp104, i32 90
+ %tmp241 = insertelement <128 x float> %tmp240, float %tmp105, i32 91
+ %tmp242 = insertelement <128 x float> %tmp241, float %tmp106, i32 92
+ %tmp243 = insertelement <128 x float> %tmp242, float %tmp107, i32 93
+ %tmp244 = insertelement <128 x float> %tmp243, float %tmp108, i32 94
+ %tmp245 = insertelement <128 x float> %tmp244, float %tmp109, i32 95
+ %tmp246 = insertelement <128 x float> %tmp245, float %tmp110, i32 96
+ %tmp247 = insertelement <128 x float> %tmp246, float %tmp111, i32 97
+ %tmp248 = insertelement <128 x float> %tmp247, float %tmp112, i32 98
+ %tmp249 = insertelement <128 x float> %tmp248, float %tmp113, i32 99
+ %tmp250 = insertelement <128 x float> %tmp249, float %tmp114, i32 100
+ %tmp251 = insertelement <128 x float> %tmp250, float %tmp115, i32 101
+ %tmp252 = insertelement <128 x float> %tmp251, float %tmp116, i32 102
+ %tmp253 = insertelement <128 x float> %tmp252, float %tmp117, i32 103
+ %tmp254 = insertelement <128 x float> %tmp253, float %tmp118, i32 104
+ %tmp255 = insertelement <128 x float> %tmp254, float %tmp119, i32 105
+ %tmp256 = insertelement <128 x float> %tmp255, float %tmp120, i32 106
+ %tmp257 = insertelement <128 x float> %tmp256, float %tmp121, i32 107
+ %tmp258 = insertelement <128 x float> %tmp257, float %tmp122, i32 108
+ %tmp259 = insertelement <128 x float> %tmp258, float %tmp123, i32 109
+ %tmp260 = insertelement <128 x float> %tmp259, float %tmp124, i32 110
+ %tmp261 = insertelement <128 x float> %tmp260, float %tmp125, i32 111
+ %tmp262 = insertelement <128 x float> %tmp261, float %tmp126, i32 112
+ %tmp263 = insertelement <128 x float> %tmp262, float %tmp127, i32 113
+ %tmp264 = insertelement <128 x float> %tmp263, float %tmp128, i32 114
+ %tmp265 = insertelement <128 x float> %tmp264, float %tmp129, i32 115
+ %tmp266 = insertelement <128 x float> %tmp265, float %tmp130, i32 116
+ %tmp267 = insertelement <128 x float> %tmp266, float %tmp131, i32 117
+ %tmp268 = insertelement <128 x float> %tmp267, float %tmp132, i32 118
+ %tmp269 = insertelement <128 x float> %tmp268, float %tmp133, i32 119
+ %tmp270 = insertelement <128 x float> %tmp269, float %tmp134, i32 120
+ %tmp271 = insertelement <128 x float> %tmp270, float %tmp135, i32 121
+ %tmp272 = insertelement <128 x float> %tmp271, float %tmp136, i32 122
+ %tmp273 = insertelement <128 x float> %tmp272, float %tmp137, i32 123
+ %tmp274 = insertelement <128 x float> %tmp273, float %tmp138, i32 124
+ %tmp275 = insertelement <128 x float> %tmp274, float %tmp139, i32 125
+ %tmp276 = insertelement <128 x float> %tmp275, float %tmp140, i32 126
+ %tmp277 = insertelement <128 x float> %tmp276, float %tmp141, i32 127
+ %tmp278 = insertelement <128 x float> %tmp277, float %tmp149, i32 %tmp146
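+ ; The variable index on the insert above keeps all 128 lanes of the
+ ; vector live at once, producing the register pressure this spill test
+ ; relies on.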
+ %tmp279 = extractelement <128 x float> %tmp278, i32 0
+ %tmp280 = extractelement <128 x float> %tmp278, i32 1
+ %tmp281 = extractelement <128 x float> %tmp278, i32 2
+ %tmp282 = extractelement <128 x float> %tmp278, i32 3
+ %tmp283 = extractelement <128 x float> %tmp278, i32 4
+ %tmp284 = extractelement <128 x float> %tmp278, i32 5
+ %tmp285 = extractelement <128 x float> %tmp278, i32 6
+ %tmp286 = extractelement <128 x float> %tmp278, i32 7
+ %tmp287 = extractelement <128 x float> %tmp278, i32 8
+ %tmp288 = extractelement <128 x float> %tmp278, i32 9
+ %tmp289 = extractelement <128 x float> %tmp278, i32 10
+ %tmp290 = extractelement <128 x float> %tmp278, i32 11
+ %tmp291 = extractelement <128 x float> %tmp278, i32 12
+ %tmp292 = extractelement <128 x float> %tmp278, i32 13
+ %tmp293 = extractelement <128 x float> %tmp278, i32 14
+ %tmp294 = extractelement <128 x float> %tmp278, i32 15
+ %tmp295 = extractelement <128 x float> %tmp278, i32 16
+ %tmp296 = extractelement <128 x float> %tmp278, i32 17
+ %tmp297 = extractelement <128 x float> %tmp278, i32 18
+ %tmp298 = extractelement <128 x float> %tmp278, i32 19
+ %tmp299 = extractelement <128 x float> %tmp278, i32 20
+ %tmp300 = extractelement <128 x float> %tmp278, i32 21
+ %tmp301 = extractelement <128 x float> %tmp278, i32 22
+ %tmp302 = extractelement <128 x float> %tmp278, i32 23
+ %tmp303 = extractelement <128 x float> %tmp278, i32 24
+ %tmp304 = extractelement <128 x float> %tmp278, i32 25
+ %tmp305 = extractelement <128 x float> %tmp278, i32 26
+ %tmp306 = extractelement <128 x float> %tmp278, i32 27
+ %tmp307 = extractelement <128 x float> %tmp278, i32 28
+ %tmp308 = extractelement <128 x float> %tmp278, i32 29
+ %tmp309 = extractelement <128 x float> %tmp278, i32 30
+ %tmp310 = extractelement <128 x float> %tmp278, i32 31
+ %tmp311 = extractelement <128 x float> %tmp278, i32 32
+ %tmp312 = extractelement <128 x float> %tmp278, i32 33
+ %tmp313 = extractelement <128 x float> %tmp278, i32 34
+ %tmp314 = extractelement <128 x float> %tmp278, i32 35
+ %tmp315 = extractelement <128 x float> %tmp278, i32 36
+ %tmp316 = extractelement <128 x float> %tmp278, i32 37
+ %tmp317 = extractelement <128 x float> %tmp278, i32 38
+ %tmp318 = extractelement <128 x float> %tmp278, i32 39
+ %tmp319 = extractelement <128 x float> %tmp278, i32 40
+ %tmp320 = extractelement <128 x float> %tmp278, i32 41
+ %tmp321 = extractelement <128 x float> %tmp278, i32 42
+ %tmp322 = extractelement <128 x float> %tmp278, i32 43
+ %tmp323 = extractelement <128 x float> %tmp278, i32 44
+ %tmp324 = extractelement <128 x float> %tmp278, i32 45
+ %tmp325 = extractelement <128 x float> %tmp278, i32 46
+ %tmp326 = extractelement <128 x float> %tmp278, i32 47
+ %tmp327 = extractelement <128 x float> %tmp278, i32 48
+ %tmp328 = extractelement <128 x float> %tmp278, i32 49
+ %tmp329 = extractelement <128 x float> %tmp278, i32 50
+ %tmp330 = extractelement <128 x float> %tmp278, i32 51
+ %tmp331 = extractelement <128 x float> %tmp278, i32 52
+ %tmp332 = extractelement <128 x float> %tmp278, i32 53
+ %tmp333 = extractelement <128 x float> %tmp278, i32 54
+ %tmp334 = extractelement <128 x float> %tmp278, i32 55
+ %tmp335 = extractelement <128 x float> %tmp278, i32 56
+ %tmp336 = extractelement <128 x float> %tmp278, i32 57
+ %tmp337 = extractelement <128 x float> %tmp278, i32 58
+ %tmp338 = extractelement <128 x float> %tmp278, i32 59
+ %tmp339 = extractelement <128 x float> %tmp278, i32 60
+ %tmp340 = extractelement <128 x float> %tmp278, i32 61
+ %tmp341 = extractelement <128 x float> %tmp278, i32 62
+ %tmp342 = extractelement <128 x float> %tmp278, i32 63
+ %tmp343 = extractelement <128 x float> %tmp278, i32 64
+ %tmp344 = extractelement <128 x float> %tmp278, i32 65
+ %tmp345 = extractelement <128 x float> %tmp278, i32 66
+ %tmp346 = extractelement <128 x float> %tmp278, i32 67
+ %tmp347 = extractelement <128 x float> %tmp278, i32 68
+ %tmp348 = extractelement <128 x float> %tmp278, i32 69
+ %tmp349 = extractelement <128 x float> %tmp278, i32 70
+ %tmp350 = extractelement <128 x float> %tmp278, i32 71
+ %tmp351 = extractelement <128 x float> %tmp278, i32 72
+ %tmp352 = extractelement <128 x float> %tmp278, i32 73
+ %tmp353 = extractelement <128 x float> %tmp278, i32 74
+ %tmp354 = extractelement <128 x float> %tmp278, i32 75
+ %tmp355 = extractelement <128 x float> %tmp278, i32 76
+ %tmp356 = extractelement <128 x float> %tmp278, i32 77
+ %tmp357 = extractelement <128 x float> %tmp278, i32 78
+ %tmp358 = extractelement <128 x float> %tmp278, i32 79
+ %tmp359 = extractelement <128 x float> %tmp278, i32 80
+ %tmp360 = extractelement <128 x float> %tmp278, i32 81
+ %tmp361 = extractelement <128 x float> %tmp278, i32 82
+ %tmp362 = extractelement <128 x float> %tmp278, i32 83
+ %tmp363 = extractelement <128 x float> %tmp278, i32 84
+ %tmp364 = extractelement <128 x float> %tmp278, i32 85
+ %tmp365 = extractelement <128 x float> %tmp278, i32 86
+ %tmp366 = extractelement <128 x float> %tmp278, i32 87
+ %tmp367 = extractelement <128 x float> %tmp278, i32 88
+ %tmp368 = extractelement <128 x float> %tmp278, i32 89
+ %tmp369 = extractelement <128 x float> %tmp278, i32 90
+ %tmp370 = extractelement <128 x float> %tmp278, i32 91
+ %tmp371 = extractelement <128 x float> %tmp278, i32 92
+ %tmp372 = extractelement <128 x float> %tmp278, i32 93
+ %tmp373 = extractelement <128 x float> %tmp278, i32 94
+ %tmp374 = extractelement <128 x float> %tmp278, i32 95
+ %tmp375 = extractelement <128 x float> %tmp278, i32 96
+ %tmp376 = extractelement <128 x float> %tmp278, i32 97
+ %tmp377 = extractelement <128 x float> %tmp278, i32 98
+ %tmp378 = extractelement <128 x float> %tmp278, i32 99
+ %tmp379 = extractelement <128 x float> %tmp278, i32 100
+ %tmp380 = extractelement <128 x float> %tmp278, i32 101
+ %tmp381 = extractelement <128 x float> %tmp278, i32 102
+ %tmp382 = extractelement <128 x float> %tmp278, i32 103
+ %tmp383 = extractelement <128 x float> %tmp278, i32 104
+ %tmp384 = extractelement <128 x float> %tmp278, i32 105
+ %tmp385 = extractelement <128 x float> %tmp278, i32 106
+ %tmp386 = extractelement <128 x float> %tmp278, i32 107
+ %tmp387 = extractelement <128 x float> %tmp278, i32 108
+ %tmp388 = extractelement <128 x float> %tmp278, i32 109
+ %tmp389 = extractelement <128 x float> %tmp278, i32 110
+ %tmp390 = extractelement <128 x float> %tmp278, i32 111
+ %tmp391 = extractelement <128 x float> %tmp278, i32 112
+ %tmp392 = extractelement <128 x float> %tmp278, i32 113
+ %tmp393 = extractelement <128 x float> %tmp278, i32 114
+ %tmp394 = extractelement <128 x float> %tmp278, i32 115
+ %tmp395 = extractelement <128 x float> %tmp278, i32 116
+ %tmp396 = extractelement <128 x float> %tmp278, i32 117
+ %tmp397 = extractelement <128 x float> %tmp278, i32 118
+ %tmp398 = extractelement <128 x float> %tmp278, i32 119
+ %tmp399 = extractelement <128 x float> %tmp278, i32 120
+ %tmp400 = extractelement <128 x float> %tmp278, i32 121
+ %tmp401 = extractelement <128 x float> %tmp278, i32 122
+ %tmp402 = extractelement <128 x float> %tmp278, i32 123
+ %tmp403 = extractelement <128 x float> %tmp278, i32 124
+ %tmp404 = extractelement <128 x float> %tmp278, i32 125
+ %tmp405 = extractelement <128 x float> %tmp278, i32 126
+ %tmp406 = extractelement <128 x float> %tmp278, i32 127
+ %tmp407 = bitcast float %tmp95 to i32
+ %tmp408 = add i32 %tmp407, 1
+ %tmp409 = bitcast i32 %tmp408 to float
+ br label %bb12
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 5ce65371b01..ef492cef576 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,7 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling < %s | FileCheck %s
-
-; FIXME: Enable -verify-instructions
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; This ends up using all 255 registers and requires register
; scavenging, which will fail to find an unused register.
@@ -11,9 +9,19 @@
; FIXME: The same register is initialized to 0 for every spill.
-; CHECK-LABEL: {{^}}main:
-; CHECK: NumVgprs: 256
-; CHECK: ScratchSize: 1024
+; GCN-LABEL: {{^}}main:
+
+; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GCN-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GCN-NEXT: s_mov_b32 s10, -1
+; SI-NEXT: s_mov_b32 s11, 0x80f000
+; VI-NEXT: s_mov_b32 s11, 0x800000
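+; The four moves above appear to build the scratch buffer resource
+; descriptor: the SCRATCH_RSRC dwords are relocated to the scratch base
+; address, -1 sets NUM_RECORDS to the maximum, and the last dword holds
+; generation-specific config bits, hence the different SI and VI values.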
+
+; s12 is the scratch wave offset user SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, s[8:11], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill
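+; The spill addresses scratch through the rsrc in s[8:11], with the
+; wave's base offset in s12 and an immediate offset selecting the slot.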
+
+; GCN: NumVgprs: 256
+; GCN: ScratchSize: 1024
define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
bb: