-rw-r--r--  llvm/include/llvm/CodeGen/TargetInstrInfo.h |  22
-rw-r--r--  llvm/lib/CodeGen/StackSlotColoring.cpp      |  10
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp        | 199
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h          |   6
-rw-r--r--  llvm/test/CodeGen/X86/pr30821.mir           | 133
5 files changed, 289 insertions, 81 deletions
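The crux of the change: StackSlotColoring's dead-store elimination used to treat any isLoadFromStackSlot/isStoreToStackSlot pair on the same slot and register as removable, even when the two accesses had different widths (PR30821: an 8-byte MOVSDrm reload followed by a 16-byte MOVAPDmr spill of the same slot). The patch threads a MemBytes out-parameter through these hooks so the pass can also require equal access sizes. The standalone sketch below models the tightened check; it is not LLVM code, and the types and names are hypothetical:

```cpp
// Standalone model of the tightened dead-store check in
// StackSlotColoring::RemoveDeadStores. Only the predicate mirrors the patch.
#include <cassert>

struct SlotAccess {
  int FrameIndex;     // stack slot index, -1 if unknown
  unsigned Reg;       // register loaded or stored
  unsigned MemBytes;  // access size in bytes
};

// A store immediately following a load is dead only if it writes the same
// slot, from the same register, AND covers exactly the same number of
// bytes. The size comparison is what this patch adds.
static bool isDeadStorePair(const SlotAccess &Load, const SlotAccess &Store) {
  return Load.FrameIndex == Store.FrameIndex && Load.Reg == Store.Reg &&
         Load.FrameIndex != -1 && Load.MemBytes == Store.MemBytes;
}

int main() {
  // PR30821: an 8-byte reload followed by a 16-byte respill of the same
  // slot. Removing the store would drop the upper 8 bytes' definition.
  assert(!isDeadStorePair({2, /*reg*/0, 8}, {2, 0, 16}));
  // A full-width reload/respill pair is still removable.
  assert(isDeadStorePair({2, 0, 16}, {2, 0, 16}));
  return 0;
}
```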
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index d6fc4482fc1..9e88c08c76c 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -225,6 +225,17 @@ public:
     return 0;
   }
 
+  /// Optional extension of isLoadFromStackSlot that returns the number of
+  /// bytes loaded from the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the load does.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                                       int &FrameIndex,
+                                       unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isLoadFromStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic so it isn't reliable for correctness.
   virtual unsigned isLoadFromStackSlotPostFE(const MachineInstr &MI,
@@ -252,6 +263,17 @@ public:
     return 0;
   }
 
+  /// Optional extension of isStoreToStackSlot that returns the number of
+  /// bytes stored to the stack. This must be implemented if a backend
+  /// supports partial stack slot spills/loads to further disambiguate
+  /// what the store does.
+  virtual unsigned isStoreToStackSlot(const MachineInstr &MI,
+                                      int &FrameIndex,
+                                      unsigned &MemBytes) const {
+    MemBytes = 0;
+    return isStoreToStackSlot(MI, FrameIndex);
+  }
+
   /// Check for post-frame ptr elimination stack locations as well.
   /// This uses a heuristic, so it isn't reliable for correctness.
   virtual unsigned isStoreToStackSlotPostFE(const MachineInstr &MI,
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 1d0aa687337..17f6b83a619 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -418,7 +418,9 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
 
     unsigned LoadReg = 0;
     unsigned StoreReg = 0;
-    if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+    unsigned LoadSize = 0;
+    unsigned StoreSize = 0;
+    if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS, LoadSize)))
       continue;
     // Skip the ...pseudo debugging... instructions between a load and store.
     while ((NextMI != E) && NextMI->isDebugValue()) {
@@ -426,9 +428,11 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
       ++I;
     }
     if (NextMI == E) continue;
-    if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+    if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS, StoreSize)))
+      continue;
+    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1 ||
+        LoadSize != StoreSize)
       continue;
-    if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
 
     ++NumDead;
     changed = true;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index ad92a038107..11ca8b0aa3f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3939,24 +3939,40 @@ bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
   return false;
 }
 
-static bool isFrameLoadOpcode(int Opcode) {
+static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
   switch (Opcode) {
   default:
     return false;
   case X86::MOV8rm:
+  case X86::KMOVBkm:
+    MemBytes = 1;
+    return true;
   case X86::MOV16rm:
+  case X86::KMOVWkm:
+    MemBytes = 2;
+    return true;
   case X86::MOV32rm:
+  case X86::MOVSSrm:
+  case X86::VMOVSSrm:
+  case X86::VMOVSSZrm:
+  case X86::KMOVDkm:
+    MemBytes = 4;
+    return true;
   case X86::MOV64rm:
   case X86::LD_Fp64m:
-  case X86::MOVSSrm:
   case X86::MOVSDrm:
+  case X86::VMOVSDrm:
+  case X86::VMOVSDZrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::KMOVQkm:
+    MemBytes = 8;
+    return true;
   case X86::MOVAPSrm:
   case X86::MOVUPSrm:
   case X86::MOVAPDrm:
   case X86::MOVUPDrm:
   case X86::MOVDQArm:
   case X86::MOVDQUrm:
-  case X86::VMOVSSrm:
-  case X86::VMOVSDrm:
   case X86::VMOVAPSrm:
   case X86::VMOVUPSrm:
@@ -3964,131 +3980,142 @@
   case X86::VMOVUPDrm:
   case X86::VMOVDQArm:
   case X86::VMOVDQUrm:
-  case X86::VMOVUPSYrm:
+  case X86::VMOVAPSZ128rm:
+  case X86::VMOVUPSZ128rm:
+  case X86::VMOVAPSZ128rm_NOVLX:
+  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPDZ128rm:
+  case X86::VMOVUPDZ128rm:
+  case X86::VMOVDQU8Z128rm:
+  case X86::VMOVDQU16Z128rm:
+  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU32Z128rm:
+  case X86::VMOVDQA64Z128rm:
+  case X86::VMOVDQU64Z128rm:
+    MemBytes = 16;
+    return true;
   case X86::VMOVAPSYrm:
-  case X86::VMOVUPDYrm:
+  case X86::VMOVUPSYrm:
   case X86::VMOVAPDYrm:
-  case X86::VMOVDQUYrm:
+  case X86::VMOVUPDYrm:
   case X86::VMOVDQAYrm:
-  case X86::MMX_MOVD64rm:
-  case X86::MMX_MOVQ64rm:
-  case X86::VMOVSSZrm:
-  case X86::VMOVSDZrm:
-  case X86::VMOVAPSZrm:
-  case X86::VMOVAPSZ128rm:
+  case X86::VMOVDQUYrm:
   case X86::VMOVAPSZ256rm:
-  case X86::VMOVAPSZ128rm_NOVLX:
-  case X86::VMOVAPSZ256rm_NOVLX:
-  case X86::VMOVUPSZrm:
-  case X86::VMOVUPSZ128rm:
   case X86::VMOVUPSZ256rm:
-  case X86::VMOVUPSZ128rm_NOVLX:
+  case X86::VMOVAPSZ256rm_NOVLX:
   case X86::VMOVUPSZ256rm_NOVLX:
-  case X86::VMOVAPDZrm:
-  case X86::VMOVAPDZ128rm:
   case X86::VMOVAPDZ256rm:
-  case X86::VMOVUPDZrm:
-  case X86::VMOVUPDZ128rm:
   case X86::VMOVUPDZ256rm:
-  case X86::VMOVDQA32Zrm:
-  case X86::VMOVDQA32Z128rm:
+  case X86::VMOVDQU8Z256rm:
+  case X86::VMOVDQU16Z256rm:
   case X86::VMOVDQA32Z256rm:
-  case X86::VMOVDQU32Zrm:
-  case X86::VMOVDQU32Z128rm:
   case X86::VMOVDQU32Z256rm:
-  case X86::VMOVDQA64Zrm:
-  case X86::VMOVDQA64Z128rm:
   case X86::VMOVDQA64Z256rm:
-  case X86::VMOVDQU64Zrm:
-  case X86::VMOVDQU64Z128rm:
   case X86::VMOVDQU64Z256rm:
+    MemBytes = 32;
+    return true;
+  case X86::VMOVAPSZrm:
+  case X86::VMOVUPSZrm:
+  case X86::VMOVAPDZrm:
+  case X86::VMOVUPDZrm:
   case X86::VMOVDQU8Zrm:
-  case X86::VMOVDQU8Z128rm:
-  case X86::VMOVDQU8Z256rm:
   case X86::VMOVDQU16Zrm:
-  case X86::VMOVDQU16Z128rm:
-  case X86::VMOVDQU16Z256rm:
-  case X86::KMOVBkm:
-  case X86::KMOVWkm:
-  case X86::KMOVDkm:
-  case X86::KMOVQkm:
+  case X86::VMOVDQA32Zrm:
+  case X86::VMOVDQU32Zrm:
+  case X86::VMOVDQA64Zrm:
+  case X86::VMOVDQU64Zrm:
+    MemBytes = 64;
     return true;
   }
 }
 
-static bool isFrameStoreOpcode(int Opcode) {
+static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
   switch (Opcode) {
-  default: break;
+  default:
+    return false;
   case X86::MOV8mr:
+  case X86::KMOVBmk:
+    MemBytes = 1;
+    return true;
   case X86::MOV16mr:
+  case X86::KMOVWmk:
+    MemBytes = 2;
+    return true;
   case X86::MOV32mr:
+  case X86::MOVSSmr:
+  case X86::VMOVSSmr:
+  case X86::VMOVSSZmr:
+  case X86::KMOVDmk:
+    MemBytes = 4;
+    return true;
   case X86::MOV64mr:
   case X86::ST_FpP64m:
-  case X86::MOVSSmr:
   case X86::MOVSDmr:
+  case X86::VMOVSDmr:
+  case X86::VMOVSDZmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+  case X86::KMOVQmk:
+    MemBytes = 8;
+    return true;
   case X86::MOVAPSmr:
   case X86::MOVUPSmr:
   case X86::MOVAPDmr:
   case X86::MOVUPDmr:
   case X86::MOVDQAmr:
   case X86::MOVDQUmr:
-  case X86::VMOVSSmr:
-  case X86::VMOVSDmr:
   case X86::VMOVAPSmr:
   case X86::VMOVUPSmr:
   case X86::VMOVAPDmr:
   case X86::VMOVUPDmr:
   case X86::VMOVDQAmr:
   case X86::VMOVDQUmr:
+  case X86::VMOVUPSZ128mr:
+  case X86::VMOVAPSZ128mr:
+  case X86::VMOVUPSZ128mr_NOVLX:
+  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVUPDZ128mr:
+  case X86::VMOVAPDZ128mr:
+  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQU32Z128mr:
+  case X86::VMOVDQA64Z128mr:
+  case X86::VMOVDQU64Z128mr:
+  case X86::VMOVDQU8Z128mr:
+  case X86::VMOVDQU16Z128mr:
+    MemBytes = 16;
+    return true;
   case X86::VMOVUPSYmr:
   case X86::VMOVAPSYmr:
   case X86::VMOVUPDYmr:
   case X86::VMOVAPDYmr:
   case X86::VMOVDQUYmr:
   case X86::VMOVDQAYmr:
-  case X86::VMOVSSZmr:
-  case X86::VMOVSDZmr:
-  case X86::VMOVUPSZmr:
-  case X86::VMOVUPSZ128mr:
   case X86::VMOVUPSZ256mr:
-  case X86::VMOVUPSZ128mr_NOVLX:
-  case X86::VMOVUPSZ256mr_NOVLX:
-  case X86::VMOVAPSZmr:
-  case X86::VMOVAPSZ128mr:
   case X86::VMOVAPSZ256mr:
-  case X86::VMOVAPSZ128mr_NOVLX:
+  case X86::VMOVUPSZ256mr_NOVLX:
   case X86::VMOVAPSZ256mr_NOVLX:
-  case X86::VMOVUPDZmr:
-  case X86::VMOVUPDZ128mr:
   case X86::VMOVUPDZ256mr:
-  case X86::VMOVAPDZmr:
-  case X86::VMOVAPDZ128mr:
   case X86::VMOVAPDZ256mr:
-  case X86::VMOVDQA32Zmr:
-  case X86::VMOVDQA32Z128mr:
+  case X86::VMOVDQU8Z256mr:
+  case X86::VMOVDQU16Z256mr:
   case X86::VMOVDQA32Z256mr:
-  case X86::VMOVDQU32Zmr:
-  case X86::VMOVDQU32Z128mr:
   case X86::VMOVDQU32Z256mr:
-  case X86::VMOVDQA64Zmr:
-  case X86::VMOVDQA64Z128mr:
   case X86::VMOVDQA64Z256mr:
-  case X86::VMOVDQU64Zmr:
-  case X86::VMOVDQU64Z128mr:
   case X86::VMOVDQU64Z256mr:
+    MemBytes = 32;
+    return true;
+  case X86::VMOVUPSZmr:
+  case X86::VMOVAPSZmr:
+  case X86::VMOVUPDZmr:
+  case X86::VMOVAPDZmr:
   case X86::VMOVDQU8Zmr:
-  case X86::VMOVDQU8Z128mr:
-  case X86::VMOVDQU8Z256mr:
   case X86::VMOVDQU16Zmr:
-  case X86::VMOVDQU16Z128mr:
-  case X86::VMOVDQU16Z256mr:
-  case X86::MMX_MOVD64mr:
-  case X86::MMX_MOVQ64mr:
-  case X86::MMX_MOVNTQmr:
-  case X86::KMOVBmk:
-  case X86::KMOVWmk:
-  case X86::KMOVDmk:
-  case X86::KMOVQmk:
+  case X86::VMOVDQA32Zmr:
+  case X86::VMOVDQU32Zmr:
+  case X86::VMOVDQA64Zmr:
+  case X86::VMOVDQU64Zmr:
+    MemBytes = 64;
     return true;
   }
   return false;
@@ -4096,7 +4123,14 @@ static bool isFrameStoreOpcode(int Opcode) {
 
 unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                            int &FrameIndex) const {
-  if (isFrameLoadOpcode(MI.getOpcode()))
+  unsigned Dummy;
+  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+                                           int &FrameIndex,
+                                           unsigned &MemBytes) const {
+  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
     if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
       return MI.getOperand(0).getReg();
   return 0;
@@ -4104,7 +4138,8 @@ unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
 
 unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
                                                  int &FrameIndex) const {
-  if (isFrameLoadOpcode(MI.getOpcode())) {
+  unsigned Dummy;
+  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
     unsigned Reg;
     if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
       return Reg;
@@ -4117,7 +4152,14 @@ unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
 
 unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
                                           int &FrameIndex) const {
-  if (isFrameStoreOpcode(MI.getOpcode()))
+  unsigned Dummy;
+  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+                                          int &FrameIndex,
+                                          unsigned &MemBytes) const {
+  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
     if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
         isFrameOperand(MI, 0, FrameIndex))
       return MI.getOperand(X86::AddrNumOperands).getReg();
@@ -4126,7 +4168,8 @@ unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
 
 unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
                                                 int &FrameIndex) const {
-  if (isFrameStoreOpcode(MI.getOpcode())) {
+  unsigned Dummy;
+  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
     unsigned Reg;
     if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
       return Reg;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 5b2799d049f..3abc0ad1458 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -238,6 +238,9 @@ public:
   unsigned isLoadFromStackSlot(const MachineInstr &MI,
                                int &FrameIndex) const override;
+  unsigned isLoadFromStackSlot(const MachineInstr &MI,
+                               int &FrameIndex,
+                               unsigned &MemBytes) const override;
   /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
   /// reliable for correctness.
@@ -246,6 +249,9 @@ public:
 
   unsigned isStoreToStackSlot(const MachineInstr &MI,
                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr &MI,
+                              int &FrameIndex,
+                              unsigned &MemBytes) const override;
   /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
   /// stack locations as well.  This uses a heuristic so it isn't
   /// reliable for correctness.
diff --git a/llvm/test/CodeGen/X86/pr30821.mir b/llvm/test/CodeGen/X86/pr30821.mir
new file mode 100644
index 00000000000..15a6eb55105
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr30821.mir
@@ -0,0 +1,133 @@
+# RUN: llc -x mir < %s -run-pass=greedy,virtregrewriter,stack-slot-coloring | FileCheck %s
+--- |
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+
+  define dso_local i32 @main() local_unnamed_addr {
+  entry:
+    ; Dummy IR that just performs some allocas -- the machine IR function
+    ; below is what this test is about.
+    %alpha = alloca i8, align 1
+    %foxtrot = alloca <2 x double>, align 16
+    %india = alloca <2 x double>, align 16
+    ret i32 0
+  }
+
+...
+---
+name:            main
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+registers:
+liveins:
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    16
+  adjustsStack:    false
+  hasCalls:        true
+  stackProtector:  ''
+  maxCallFrameSize: 4294967295
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+  localFrameSize:  0
+  savePoint:       ''
+  restorePoint:    ''
+fixedStack:
+stack:
+  - { id: 0, name: alpha, type: default, offset: 0, size: 1, alignment: 1,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 1, name: foxtrot, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+  - { id: 2, name: india, type: default, offset: 0, size: 16, alignment: 16,
+      stack-id: 0, callee-saved-register: '', callee-saved-restored: true,
+      di-variable: '', di-expression: '', di-location: '' }
+constants:
+body:             |
+  bb.0.entry:
+    ; To trick stack-slot-colouring into running its dead-store-elimination
+    ; phase, which is at fault, we need the register allocator to run and
+    ; spill in two places that can have their slots merged. Achieve this by
+    ; volatile-loading data into $xmm[0-14] and volatile-storing it later,
+    ; leaving regalloc only $xmm15 to play with in the middle.
+    ; Then, perform two virtreg load-and-store pairs, with the faulty code
+    ; sequence in the middle (MOVSDrm then MOVAPDmr on the same slot). The
+    ; virtregs get spilt, the corresponding stack slots merged, and the
+    ; faulty code sequence eliminated if LLVM is broken.
+
+    ; Make the first 15 $xmm registers live
+    $xmm0 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm1 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm2 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm3 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm4 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm5 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm6 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm7 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm8 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm9 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm10 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm11 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm12 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm13 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+    $xmm14 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+
+    ; First vreg load
+    %1:vr128 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+
+    ; First faulty sequence; %1 spilt
+    %12:fr64 = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    %13:vr128 = COPY killed %12
+    MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %13 :: (volatile store 16 into %ir.india)
+    ; CHECK: renamable $xmm{{[0-9]+}} = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    ; CHECK-NEXT: MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed renamable $xmm{{[0-9]+}} :: (volatile store 16 into %ir.india)
+
+    ; Store %1 to avoid it being optimised out; this results in a load-from-spill
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %1 :: (volatile dereferenceable store 16 into %ir.india)
+
+    ; That code sequence a second time, to generate a second spill slot that
+    ; will get coloured and merged.
+    %2:vr128 = MOVUPDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 16 from %ir.india)
+
+    %22:fr64 = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    %23:vr128 = COPY killed %22
+    MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %23 :: (volatile store 16 into %ir.india)
+
+    ; CHECK: renamable $xmm{{[0-9]+}} = MOVSDrm %stack.2.india, 1, $noreg, 0, $noreg :: (volatile dereferenceable load 8 from %ir.india)
+    ; CHECK-NEXT: MOVAPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed renamable $xmm{{[0-9]+}} :: (volatile store 16 into %ir.india)
+
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed %2 :: (volatile dereferenceable store 16 into %ir.india)
+
+    ; Stores of the first 15 $xmm registers to keep them live across the
+    ; middle of this bb
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm0 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm1 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm2 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm3 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm4 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm5 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm6 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm7 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm8 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm9 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm10 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm11 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm12 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm13 :: (volatile dereferenceable store 16 into %ir.india)
+    MOVUPDmr %stack.2.india, 1, $noreg, 0, $noreg, killed $xmm14 :: (volatile dereferenceable store 16 into %ir.india)
+
+    RET 0
+
+...
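For out-of-tree targets, the new overloads are opt-in: the TargetInstrInfo defaults report MemBytes = 0 and defer to the old hooks, so RemoveDeadStores still behaves as before when a backend does not override them (LoadSize and StoreSize both stay 0 and the new equality check passes). A backend with partial spills would override both forms along the lines of the X86 changes above. Below is a minimal sketch of the load side for a hypothetical target; the class MyTargetInstrInfo, the opcodes MyTarget::LD32/LD64, and the operand layout are all invented for illustration:

```cpp
// Hypothetical backend override of the new MemBytes overload, following the
// X86 pattern: classify the opcode, report the access size, and let the
// two-argument form delegate with a dummy size so existing callers work.
unsigned MyTargetInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                                int &FrameIndex) const {
  unsigned Dummy;
  return isLoadFromStackSlot(MI, FrameIndex, Dummy);
}

unsigned MyTargetInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
                                                int &FrameIndex,
                                                unsigned &MemBytes) const {
  switch (MI.getOpcode()) {
  default:
    return 0;
  case MyTarget::LD32: MemBytes = 4; break; // invented opcodes
  case MyTarget::LD64: MemBytes = 8; break;
  }
  // Assume operand 0 is the destination register and operand 1 is the
  // frame-index operand in this hypothetical encoding.
  if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI()) {
    FrameIndex = MI.getOperand(1).getIndex();
    return MI.getOperand(0).getReg();
  }
  return 0;
}
```

The store-side override would mirror this, just as X86's isStoreToStackSlot mirrors its isLoadFromStackSlot.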

