-rw-r--r-- | llvm/lib/Target/X86/X86FastISel.cpp | 32
-rw-r--r-- | llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 13
-rw-r--r-- | llvm/lib/Target/X86/X86InstrCompiler.td | 6
-rw-r--r-- | llvm/lib/Target/X86/X86InstrInfo.cpp | 23
-rw-r--r-- | llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp | 10
-rw-r--r-- | llvm/test/CodeGen/X86/GlobalISel/constant.ll | 2
-rw-r--r-- | llvm/test/CodeGen/X86/avg.ll | 455
-rw-r--r-- | llvm/test/CodeGen/X86/crash-O0.ll | 8
-rw-r--r-- | llvm/test/CodeGen/X86/hoist-spill.ll | 2
-rw-r--r-- | llvm/test/CodeGen/X86/machine-cse.ll | 15
-rw-r--r-- | llvm/test/CodeGen/X86/madd.ll | 66
-rw-r--r-- | llvm/test/CodeGen/X86/mmx-arith.ll | 11
-rw-r--r-- | llvm/test/CodeGen/X86/pr32284.ll | 19
-rw-r--r-- | llvm/test/CodeGen/X86/pr32340.ll | 25
-rw-r--r-- | llvm/test/CodeGen/X86/scheduler-backtracking.ll | 212
-rw-r--r-- | llvm/test/CodeGen/X86/spill-zero-x86_64.ll | 75
-rw-r--r-- | llvm/test/CodeGen/X86/swifterror.ll | 21 |
17 files changed, 472 insertions, 523 deletions
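
This patch removes the X86 MOV64r0 pseudo: 64-bit zeros are instead materialized as the 32-bit XOR idiom (MOV32r0) widened with SUBREG_TO_REG, relying on the fact that a 32-bit register write implicitly zero-extends to 64 bits on x86-64. A minimal sketch of the pattern the X86FastISel.cpp hunk below introduces (virtual-register names are illustrative):

    // Emit a 64-bit zero without a dedicated 64-bit pseudo: materialize
    // MOV32r0 (later expanded to `xorl reg, reg`) into a GR32 vreg, then
    // widen it to GR64 with SUBREG_TO_REG over the sub_32bit index.
    unsigned Zero32 = createResultReg(&X86::GR32RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(X86::MOV32r0), Zero32);
    unsigned Zero64 = createResultReg(&X86::GR64RegClass);
    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
            TII.get(TargetOpcode::SUBREG_TO_REG), Zero64)
        .addImm(0)
        .addReg(Zero32)
        .addImm(X86::sub_32bit);

The SelectionDAG, instruction-table, and speculative-load-hardening changes apply the same MOV32r0 + SUBREG_TO_REG rewrite in their respective representations; the remaining files are test-output updates.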
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index b87f4802473..a49ad8bd59d 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -1916,8 +1916,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { { &X86::GR64RegClass, X86::RAX, X86::RDX, { { X86::IDIV64r, X86::CQO, Copy, X86::RAX, S }, // SDiv { X86::IDIV64r, X86::CQO, Copy, X86::RDX, S }, // SRem - { X86::DIV64r, X86::MOV64r0, Copy, X86::RAX, U }, // UDiv - { X86::DIV64r, X86::MOV64r0, Copy, X86::RDX, U }, // URem + { X86::DIV64r, X86::MOV32r0, Copy, X86::RAX, U }, // UDiv + { X86::DIV64r, X86::MOV32r0, Copy, X86::RDX, U }, // URem } }, // i64 }; @@ -1964,22 +1964,26 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpEntry.OpSignExtend)); else { - unsigned ZeroReg = createResultReg(VT == MVT::i64 ? &X86::GR64RegClass - : &X86::GR32RegClass); + unsigned Zero32 = createResultReg(&X86::GR32RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(OpEntry.OpSignExtend), ZeroReg); + TII.get(X86::MOV32r0), Zero32); // Copy the zero into the appropriate sub/super/identical physical // register. Unfortunately the operations needed are not uniform enough // to fit neatly into the table above. - if (VT == MVT::i16) + if (VT == MVT::i16) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) - .addReg(ZeroReg, 0, X86::sub_16bit); - else + .addReg(Zero32, 0, X86::sub_16bit); + } else if (VT == MVT::i32) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Copy), TypeEntry.HighInReg) - .addReg(ZeroReg); + .addReg(Zero32); + } else if (VT == MVT::i64) { + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), TypeEntry.HighInReg) + .addImm(0).addReg(Zero32).addImm(X86::sub_32bit); + } } } // Generate the DIV/IDIV instruction. @@ -3704,9 +3708,6 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { uint64_t Imm = CI->getZExtValue(); if (Imm == 0) { - if (VT.SimpleTy == MVT::i64) - return fastEmitInst_(X86::MOV64r0, &X86::GR64RegClass); - unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass); switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type"); @@ -3719,6 +3720,13 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) { X86::sub_16bit); case MVT::i32: return SrcReg; + case MVT::i64: { + unsigned ResultReg = createResultReg(&X86::GR64RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) + .addImm(0).addReg(SrcReg).addImm(X86::sub_32bit); + return ResultReg; + } } } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 717ecc031c0..16819f4451c 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -3591,10 +3591,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. - unsigned ClrOpc = NVT.SimpleTy == MVT::i64 ? X86::MOV64r0 - : X86::MOV32r0; - MVT ClrVT = NVT.SimpleTy == MVT::i64 ? 
MVT::i64 : MVT::i32; - SDValue ClrNode = SDValue(CurDAG->getMachineNode(ClrOpc, dl, ClrVT), 0); + SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = @@ -3605,7 +3602,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) { 0); break; case MVT::i32: + break; case MVT::i64: + ClrNode = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, + CurDAG->getTargetConstant(X86::sub_32bit, dl, + MVT::i32)), + 0); break; default: llvm_unreachable("Unexpected division source"); diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 2805517b747..71b43a38dc2 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -270,18 +270,16 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), "", []>; // Alias instruction mapping movr0 to xor. // FIXME: remove when we can teach regalloc that xor reg, reg is ok. let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, AddedComplexity = 10 in { + isPseudo = 1, AddedComplexity = 10 in def MOV32r0 : I<0, Pseudo, (outs GR32:$dst), (ins), "", [(set GR32:$dst, 0)]>, Sched<[WriteZero]>; -def MOV64r0 : I<0, Pseudo, (outs GR64:$dst), (ins), "", - [(set GR64:$dst, 0)]>, Sched<[WriteZero]>; -} // Other widths can also make use of the 32-bit xor, which may have a smaller // encoding and avoid partial register updates. let AddedComplexity = 10 in { def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>; def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; +def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>; } let Predicates = [OptForSize, Not64BitMode], diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 88f2f0fffd6..ae45301f04b 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -683,10 +683,8 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) { // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side // effects. - unsigned NewOpc = X86::MOV32ri; int Value; switch (Orig.getOpcode()) { - case X86::MOV64r0: NewOpc = X86::MOV32ri64; Value = 0; break; case X86::MOV32r0: Value = 0; break; case X86::MOV32r1: Value = 1; break; case X86::MOV32r_1: Value = -1; break; @@ -695,7 +693,7 @@ void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB, } const DebugLoc &DL = Orig.getDebugLoc(); - BuildMI(MBB, I, DL, get(NewOpc)) + BuildMI(MBB, I, DL, get(X86::MOV32ri)) .add(Orig.getOperand(0)) .addImm(Value); } else { @@ -3752,9 +3750,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, // MOV32r0 etc. are implemented with xor which clobbers condition code. // They are safe to move up, if the definition to EFLAGS is dead and // earlier instructions do not read or write EFLAGS. 
- if (!Movr0Inst && - (Instr.getOpcode() == X86::MOV32r0 || - Instr.getOpcode() == X86::MOV64r0) && + if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 && Instr.registerDefIsDead(X86::EFLAGS, TRI)) { Movr0Inst = &Instr; continue; @@ -4159,15 +4155,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { switch (MI.getOpcode()) { case X86::MOV32r0: return Expand2AddrUndef(MIB, get(X86::XOR32rr)); - case X86::MOV64r0: { - const TargetRegisterInfo *TRI = &getRegisterInfo(); - unsigned Reg = MIB->getOperand(0).getReg(); - unsigned Reg32 = TRI->getSubReg(Reg, X86::sub_32bit); - MIB->getOperand(0).setReg(Reg32); - Expand2AddrUndef(MIB, get(X86::XOR32rr)); - MIB.addReg(Reg, RegState::ImplicitDefine); - return true; - } case X86::MOV32r1: return expandMOV32r1(MIB, *this, /*MinusOne=*/ false); case X86::MOV32r_1: @@ -4911,10 +4898,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( isTwoAddrFold = true; } else { if (OpNum == 0) { - if (MI.getOpcode() == X86::MOV32r0 || MI.getOpcode() == X86::MOV64r0) { - unsigned NewOpc = MI.getOpcode() == X86::MOV64r0 ? X86::MOV64mi32 - : X86::MOV32mi; - NewMI = MakeM0Inst(*this, NewOpc, MOs, InsertPt, MI); + if (MI.getOpcode() == X86::MOV32r0) { + NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI); if (NewMI) return NewMI; } diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp index 20997ecc07d..14e4c455a08 100644 --- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -487,14 +487,20 @@ bool X86SpeculativeLoadHardeningPass::runOnMachineFunction( // Otherwise, just build the predicate state itself by zeroing a register // as we don't need any initial state. PS->InitialReg = MRI->createVirtualRegister(PS->RC); - auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64r0), - PS->InitialReg); + unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); + auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0), + PredStateSubReg); ++NumInstsInserted; MachineOperand *ZeroEFLAGSDefOp = ZeroI->findRegisterDefOperand(X86::EFLAGS); assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() && "Must have an implicit def of EFLAGS!"); ZeroEFLAGSDefOp->setIsDead(true); + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG), + PS->InitialReg) + .addImm(0) + .addReg(PredStateSubReg) + .addImm(X86::sub_32bit); } // We're going to need to trace predicate state throughout the function's diff --git a/llvm/test/CodeGen/X86/GlobalISel/constant.ll b/llvm/test/CodeGen/X86/GlobalISel/constant.ll index 2043c60f499..f6ebb70fcf5 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/constant.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/constant.ll @@ -54,7 +54,7 @@ define i64 @const_i64_i32() { define void @main(i32 ** %data) { ; ALL-LABEL: main: ; ALL: # %bb.0: -; ALL-NEXT: xorl %eax, %eax +; ALL-NEXT: movq $0, %rax ; ALL-NEXT: movq %rax, (%rdi) ; ALL-NEXT: retq store i32* null, i32** %data, align 8 diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index c4b15070bad..84f1296d51c 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -2141,7 +2141,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: pushq %r13 ; AVX1-NEXT: pushq %r12 ; AVX1-NEXT: pushq %rbx -; AVX1-NEXT: subq $16, %rsp +; AVX1-NEXT: subq $24, %rsp ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero @@ -2152,12 +2152,12 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vmovq %xmm5, %rbp ; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %rcx -; AVX1-NEXT: vmovq %xmm4, %rsi +; AVX1-NEXT: vpextrq $1, %xmm4, %rsi +; AVX1-NEXT: vmovq %xmm4, %rcx ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %r10 +; AVX1-NEXT: vpextrq $1, %xmm4, %r8 ; AVX1-NEXT: vmovq %xmm4, %r11 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero @@ -2166,7 +2166,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero ; AVX1-NEXT: vpextrq $1, %xmm4, %r15 -; AVX1-NEXT: vmovq %xmm4, %rdx +; AVX1-NEXT: vmovq %xmm4, %rdi ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero ; AVX1-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill @@ -2175,28 +2175,27 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpextrq $1, %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: vmovq %xmm3, %r9 +; AVX1-NEXT: vmovq %xmm3, %r10 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm4, %r14 -; AVX1-NEXT: addq %rbx, %r14 -; AVX1-NEXT: vmovq %xmm4, %r8 -; AVX1-NEXT: addq %rbp, %r8 +; AVX1-NEXT: vpextrq $1, %xmm4, %rdx +; AVX1-NEXT: addq %rbx, %rdx +; AVX1-NEXT: vmovq %xmm4, %r9 +; AVX1-NEXT: addq %rbp, %r9 ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero -; AVX1-NEXT: vpextrq $1, %xmm3, %rdi -; AVX1-NEXT: addq %rcx, %rdi -; AVX1-NEXT: vmovq %xmm3, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: addq %rsi, %rax -; AVX1-NEXT: movq %rax, %rsi +; AVX1-NEXT: movq %rax, %r14 +; AVX1-NEXT: vmovq %xmm3, %rbp +; AVX1-NEXT: addq %rcx, %rbp ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: addq %r10, %rax -; AVX1-NEXT: movq %rax, %r10 +; AVX1-NEXT: vpextrq $1, %xmm3, %rsi +; AVX1-NEXT: addq %r8, %rsi ; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: addq %r11, %rax ; AVX1-NEXT: movq %rax, %r11 @@ -2204,17 
+2203,17 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: addq %r13, %rax -; AVX1-NEXT: movq %rax, %rbx +; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: addq %r12, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movq %rax, %r8 ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero ; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: addq %r15, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movq %rax, %rbx ; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: addq %rdx, %rax +; AVX1-NEXT: addq %rdi, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero @@ -2227,40 +2226,41 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpextrq $1, %xmm2, %rbp -; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; AVX1-NEXT: vmovq %xmm2, %r15 -; AVX1-NEXT: addq %r9, %r15 +; AVX1-NEXT: vpextrq $1, %xmm2, %rax +; AVX1-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vmovq %xmm2, %r12 +; AVX1-NEXT: addq %r10, %r12 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpextrq $1, %xmm0, %r9 -; AVX1-NEXT: addq %rax, %r9 -; AVX1-NEXT: vmovq %xmm1, %rcx -; AVX1-NEXT: vmovq %xmm0, %rdx -; AVX1-NEXT: addq %rcx, %rdx -; AVX1-NEXT: addq $-1, %r14 -; AVX1-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: vpextrq $1, %xmm0, %r10 +; AVX1-NEXT: addq %rax, %r10 +; AVX1-NEXT: vmovq %xmm1, %rax +; AVX1-NEXT: vmovq %xmm0, %rdi +; AVX1-NEXT: addq %rax, %rdi +; AVX1-NEXT: addq $-1, %rdx +; AVX1-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %r8 -; AVX1-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r9 +; AVX1-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: movq %rax, (%rsp) # 8-byte Spill -; AVX1-NEXT: addq $-1, %rdi -; AVX1-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %r14 +; AVX1-NEXT: movq %r14, (%rsp) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %rsi -; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %rbp +; AVX1-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: addq $-1, %r10 -; AVX1-NEXT: movq %r10, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %rsi +; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -2269,90 +2269,93 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: addq $-1, %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %ebp +; AVX1-NEXT: adcq $-1, %rbp +; AVX1-NEXT: addq $-1, %r8 +; AVX1-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movl $0, %r15d +; AVX1-NEXT: adcq $-1, %r15 ; AVX1-NEXT: addq $-1, %rbx ; AVX1-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX1-NEXT: movl $0, %eax ; AVX1-NEXT: adcq $-1, %rax ; AVX1-NEXT: movq %rax, %rsi ; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: movl $0, %r12d -; AVX1-NEXT: adcq $-1, %r12 -; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: movl $0, %r13d ; AVX1-NEXT: adcq $-1, %r13 ; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX1-NEXT: movl $0, %r14d ; AVX1-NEXT: adcq $-1, %r14 -; AVX1-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX1-NEXT: movl $0, %ebx -; AVX1-NEXT: adcq $-1, %rbx -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX1-NEXT: addq $-1, %rcx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload +; AVX1-NEXT: addq $-1, %rdx ; AVX1-NEXT: movl $0, %r11d ; AVX1-NEXT: adcq $-1, %r11 -; AVX1-NEXT: addq $-1, %rbp -; AVX1-NEXT: movl $0, %r10d -; AVX1-NEXT: adcq $-1, %r10 -; AVX1-NEXT: addq $-1, %r15 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: addq $-1, %rax +; AVX1-NEXT: movl $0, %ebx +; AVX1-NEXT: adcq $-1, %rbx +; AVX1-NEXT: addq $-1, %r12 +; AVX1-NEXT: movl $0, %r9d +; AVX1-NEXT: adcq $-1, %r9 +; AVX1-NEXT: addq $-1, %r10 ; AVX1-NEXT: movl $0, %r8d ; AVX1-NEXT: adcq $-1, %r8 -; AVX1-NEXT: addq $-1, %r9 -; AVX1-NEXT: movl $0, %edi -; AVX1-NEXT: adcq $-1, %rdi -; AVX1-NEXT: addq $-1, %rdx -; AVX1-NEXT: movl $0, %eax -; AVX1-NEXT: adcq $-1, %rax -; AVX1-NEXT: shldq $63, %rdx, %rax -; AVX1-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX1-NEXT: shldq $63, %r9, %rdi -; AVX1-NEXT: shldq $63, %r15, %r8 -; AVX1-NEXT: shldq $63, %rbp, %r10 -; AVX1-NEXT: shldq $63, %rcx, %r11 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rdx, %rbx +; AVX1-NEXT: addq $-1, %rdi +; AVX1-NEXT: movl $0, %ecx +; AVX1-NEXT: adcq $-1, %rcx +; AVX1-NEXT: shldq $63, %rdi, %rcx +; AVX1-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: shldq $63, %r10, %r8 +; AVX1-NEXT: shldq $63, %r12, %r9 +; AVX1-NEXT: shldq $63, %rax, %rbx +; AVX1-NEXT: shldq $63, %rdx, %r11 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX1-NEXT: shldq $63, %rdx, %r14 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX1-NEXT: shldq $63, %rdx, %r13 -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rdx, %r12 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shldq $63, %rax, %rsi ; AVX1-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shldq $63, %rax, 
%r15 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shldq $63, %rax, %rbp ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX1-NEXT: shldq $63, %rax, %rsi -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: shldq $63, %rax, %r15 +; AVX1-NEXT: shldq $63, %rax, %rcx +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rcx, %rax -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rcx, %r9 -; AVX1-NEXT: movq (%rsp), %rcx # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: shldq $63, %rdx, %rcx +; AVX1-NEXT: shldq $63, %rax, %rdi +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX1-NEXT: movq (%rsp), %rax # 8-byte Reload +; AVX1-NEXT: shldq $63, %rax, %r12 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shldq $63, %rax, %r10 ; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX1-NEXT: shldq $63, %rbp, %rdx -; AVX1-NEXT: vmovq %rdx, %xmm8 -; AVX1-NEXT: vmovq %rcx, %xmm0 -; AVX1-NEXT: vmovq %r9, %xmm1 -; AVX1-NEXT: vmovq %rax, %xmm11 -; AVX1-NEXT: vmovq %r15, %xmm2 +; AVX1-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX1-NEXT: shldq $63, %rdx, %rax +; AVX1-NEXT: vmovq %rax, %xmm8 +; AVX1-NEXT: vmovq %r10, %xmm0 +; AVX1-NEXT: vmovq %r12, %xmm1 +; AVX1-NEXT: vmovq %rdi, %xmm11 +; AVX1-NEXT: vmovq %rcx, %xmm2 ; AVX1-NEXT: vmovq %rsi, %xmm13 -; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 8-byte Folded Reload -; AVX1-NEXT: # xmm14 = mem[0],zero -; AVX1-NEXT: vmovq %r12, %xmm15 -; AVX1-NEXT: vmovq %r13, %xmm9 -; AVX1-NEXT: vmovq %r14, %xmm10 -; AVX1-NEXT: vmovq %rbx, %xmm12 +; AVX1-NEXT: vmovq %rbp, %xmm14 +; AVX1-NEXT: vmovq %r15, %xmm15 +; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 8-byte Folded Reload +; AVX1-NEXT: # xmm9 = mem[0],zero +; AVX1-NEXT: vmovq %r13, %xmm10 +; AVX1-NEXT: vmovq %r14, %xmm12 ; AVX1-NEXT: vmovq %r11, %xmm3 -; AVX1-NEXT: vmovq %r10, %xmm4 -; AVX1-NEXT: vmovq %r8, %xmm5 -; AVX1-NEXT: vmovq %rdi, %xmm6 +; AVX1-NEXT: vmovq %rbx, %xmm4 +; AVX1-NEXT: vmovq %r9, %xmm5 +; AVX1-NEXT: vmovq %r8, %xmm6 ; AVX1-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload ; AVX1-NEXT: # xmm7 = mem[0],zero ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm0[0],xmm8[0] @@ -2379,7 +2382,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX1-NEXT: vmovdqu %xmm0, (%rax) -; AVX1-NEXT: addq $16, %rsp +; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: popq %rbx ; AVX1-NEXT: popq %r12 ; AVX1-NEXT: popq %r13 @@ -2404,15 +2407,15 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-NEXT: 
vpextrq $1, %xmm4, %rbx -; AVX2-NEXT: vmovq %xmm4, %rdx +; AVX2-NEXT: vmovq %xmm4, %rbp ; AVX2-NEXT: vpextrq $1, %xmm3, %rdi ; AVX2-NEXT: vmovq %xmm3, %rcx ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-NEXT: vpextrq $1, %xmm3, %r9 -; AVX2-NEXT: vmovq %xmm3, %r10 -; AVX2-NEXT: vpextrq $1, %xmm2, %r13 +; AVX2-NEXT: vpextrq $1, %xmm3, %rdx +; AVX2-NEXT: vmovq %xmm3, %r9 +; AVX2-NEXT: vpextrq $1, %xmm2, %r11 ; AVX2-NEXT: vmovq %xmm2, %r12 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -2430,26 +2433,26 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-NEXT: vpextrq $1, %xmm4, %rbp -; AVX2-NEXT: addq %rbx, %rbp -; AVX2-NEXT: vmovq %xmm4, %rax -; AVX2-NEXT: addq %rdx, %rax -; AVX2-NEXT: movq %rax, %r11 -; AVX2-NEXT: vpextrq $1, %xmm3, %r8 -; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: addq %rbx, %rax +; AVX2-NEXT: movq %rax, %rbx +; AVX2-NEXT: vmovq %xmm4, %r13 +; AVX2-NEXT: addq %rbp, %r13 +; AVX2-NEXT: vpextrq $1, %xmm3, %r10 +; AVX2-NEXT: addq %rdi, %r10 ; AVX2-NEXT: vmovq %xmm3, %r14 ; AVX2-NEXT: addq %rcx, %r14 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: addq %r9, %rax -; AVX2-NEXT: movq %rax, %rbx -; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: addq %r10, %rax -; AVX2-NEXT: movq %rax, %r10 -; AVX2-NEXT: vpextrq $1, %xmm2, %rcx -; AVX2-NEXT: addq %r13, %rcx +; AVX2-NEXT: addq %rdx, %rax +; AVX2-NEXT: movq %rax, %rcx +; AVX2-NEXT: vmovq %xmm3, %r8 +; AVX2-NEXT: addq %r9, %r8 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: addq %r11, %rax +; AVX2-NEXT: movq %rax, %r11 ; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: addq %r12, %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -2471,8 +2474,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vpextrq $1, %xmm2, %r12 -; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload +; AVX2-NEXT: vpextrq $1, %xmm2, %rbp +; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload ; AVX2-NEXT: vmovq %xmm2, %r9 ; AVX2-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload ; AVX2-NEXT: vpextrq $1, %xmm1, %rax @@ -2481,36 +2484,36 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vmovq %xmm1, %rdx ; AVX2-NEXT: vmovq %xmm0, %rsi ; AVX2-NEXT: addq %rdx, %rsi -; AVX2-NEXT: addq $-1, %rbp -; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %rbx +; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq 
$-1, %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r11 -; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r13 +; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq $-1, %rax ; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r8 -; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r10 +; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq $-1, %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: addq $-1, %r14 ; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: movl $0, %ebp -; AVX2-NEXT: adcq $-1, %rbp -; AVX2-NEXT: addq $-1, %rbx -; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movl $0, %r13d +; AVX2-NEXT: adcq $-1, %r13 +; AVX2-NEXT: addq $-1, %rcx +; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %eax ; AVX2-NEXT: adcq $-1, %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT: addq $-1, %r10 -; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r8 +; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %r15d ; AVX2-NEXT: adcq $-1, %r15 -; AVX2-NEXT: addq $-1, %rcx -; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: addq $-1, %r11 +; AVX2-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: movl $0, %ebx ; AVX2-NEXT: adcq $-1, %rbx ; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill @@ -2525,13 +2528,13 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: adcq $-1, %rax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT: movl $0, %r13d -; AVX2-NEXT: adcq $-1, %r13 +; AVX2-NEXT: movl $0, %r12d +; AVX2-NEXT: adcq $-1, %r12 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: addq $-1, %rcx ; AVX2-NEXT: movl $0, %r11d ; AVX2-NEXT: adcq $-1, %r11 -; AVX2-NEXT: addq $-1, %r12 +; AVX2-NEXT: addq $-1, %rbp ; AVX2-NEXT: movl $0, %r14d ; AVX2-NEXT: adcq $-1, %r14 ; AVX2-NEXT: addq $-1, %r9 @@ -2547,10 +2550,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX2-NEXT: shldq $63, %rdi, %rdx ; AVX2-NEXT: shldq $63, %r9, %r10 -; AVX2-NEXT: shldq $63, %r12, %r14 +; AVX2-NEXT: shldq $63, %rbp, %r14 ; AVX2-NEXT: shldq $63, %rcx, %r11 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r13 +; AVX2-NEXT: shldq $63, %rcx, %r12 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shldq $63, %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload @@ -2566,10 +2569,10 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shldq $63, %rcx, %rax ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %rbp -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: shldq $63, %rcx, %r13 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), 
%rbp # 8-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT: shldq $63, %rcx, %r12 +; AVX2-NEXT: shldq $63, %rcx, %rbp ; AVX2-NEXT: movq (%rsp), %rdi # 8-byte Reload ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX2-NEXT: shldq $63, %rcx, %rdi @@ -2578,8 +2581,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: shldq $63, %rcx, %rsi ; AVX2-NEXT: vmovq %rsi, %xmm8 ; AVX2-NEXT: vmovq %rdi, %xmm9 -; AVX2-NEXT: vmovq %r12, %xmm10 -; AVX2-NEXT: vmovq %rbp, %xmm11 +; AVX2-NEXT: vmovq %rbp, %xmm10 +; AVX2-NEXT: vmovq %r13, %xmm11 ; AVX2-NEXT: vmovq %rax, %xmm12 ; AVX2-NEXT: vmovq %r15, %xmm13 ; AVX2-NEXT: vmovq %rbx, %xmm14 @@ -2587,7 +2590,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX2-NEXT: vmovq %r9, %xmm0 ; AVX2-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Folded Reload ; AVX2-NEXT: # xmm1 = mem[0],zero -; AVX2-NEXT: vmovq %r13, %xmm2 +; AVX2-NEXT: vmovq %r12, %xmm2 ; AVX2-NEXT: vmovq %r11, %xmm3 ; AVX2-NEXT: vmovq %r14, %xmm4 ; AVX2-NEXT: vmovq %r10, %xmm5 @@ -2644,7 +2647,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: pushq %r13 ; AVX512-NEXT: pushq %r12 ; AVX512-NEXT: pushq %rbx -; AVX512-NEXT: subq $16, %rsp +; AVX512-NEXT: subq $24, %rsp ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxbw {{.*#+}} ymm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero @@ -2657,8 +2660,8 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: vmovq %xmm3, %rsi ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpextrq $1, %xmm3, %rcx -; AVX512-NEXT: vmovq %xmm3, %r10 +; AVX512-NEXT: vpextrq $1, %xmm3, %rdx +; AVX512-NEXT: vmovq %xmm3, %r8 ; AVX512-NEXT: vpextrq $1, %xmm2, %r13 ; AVX512-NEXT: vmovq %xmm2, %r12 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -2666,7 +2669,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpextrq $1, %xmm3, %rdx +; AVX512-NEXT: vpextrq $1, %xmm3, %r15 ; AVX512-NEXT: vmovq %xmm3, %r14 ; AVX512-NEXT: vpextrq $1, %xmm2, %r9 ; AVX512-NEXT: vmovq %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill @@ -2678,34 +2681,35 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX512-NEXT: vpextrq $1, %xmm4, %r11 -; AVX512-NEXT: addq %rbx, %r11 +; AVX512-NEXT: vpextrq $1, %xmm4, %rax +; AVX512-NEXT: addq %rbx, %rax +; AVX512-NEXT: movq %rax, %rbx ; AVX512-NEXT: vmovq %xmm4, %rax ; 
AVX512-NEXT: addq %rbp, %rax -; AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: vpextrq $1, %xmm3, %r8 -; AVX512-NEXT: addq %rdi, %r8 -; AVX512-NEXT: vmovq %xmm3, %r15 -; AVX512-NEXT: addq %rsi, %r15 +; AVX512-NEXT: movq %rax, %rbp +; AVX512-NEXT: vpextrq $1, %xmm3, %rax +; AVX512-NEXT: addq %rdi, %rax +; AVX512-NEXT: movq %rax, %rdi +; AVX512-NEXT: vmovq %xmm3, %r10 +; AVX512-NEXT: addq %rsi, %r10 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512-NEXT: vpextrq $1, %xmm3, %rdi -; AVX512-NEXT: addq %rcx, %rdi +; AVX512-NEXT: vpextrq $1, %xmm3, %rcx +; AVX512-NEXT: addq %rdx, %rcx ; AVX512-NEXT: vmovq %xmm3, %rax -; AVX512-NEXT: addq %r10, %rax -; AVX512-NEXT: movq %rax, %r10 +; AVX512-NEXT: addq %r8, %rax +; AVX512-NEXT: movq %rax, %r8 ; AVX512-NEXT: vpextrq $1, %xmm2, %rsi ; AVX512-NEXT: addq %r13, %rsi -; AVX512-NEXT: vmovq %xmm2, %rax -; AVX512-NEXT: addq %r12, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: vmovq %xmm2, %r11 +; AVX512-NEXT: addq %r12, %r11 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512-NEXT: vpextrq $1, %xmm3, %rax -; AVX512-NEXT: addq %rdx, %rax +; AVX512-NEXT: addq %r15, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vmovq %xmm3, %rax ; AVX512-NEXT: addq %r14, %rax @@ -2718,33 +2722,24 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX512-NEXT: vpextrq $1, %xmm2, %rbp -; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload +; AVX512-NEXT: vpextrq $1, %xmm2, %rax +; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: vmovq %xmm2, %r14 ; AVX512-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload ; AVX512-NEXT: vpextrq $1, %xmm0, %rax ; AVX512-NEXT: vpextrq $1, %xmm1, %r9 ; AVX512-NEXT: addq %rax, %r9 -; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm0, %rax ; AVX512-NEXT: vmovq %xmm1, %rdx -; AVX512-NEXT: addq %rcx, %rdx -; AVX512-NEXT: addq $-1, %r11 -; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl $0, %eax -; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq %rax, %rdx ; AVX512-NEXT: addq $-1, %rbx ; AVX512-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill -; AVX512-NEXT: addq $-1, %r8 -; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl $0, %eax -; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: addq $-1, %r15 -; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %rbp +; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; 
AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax ; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -2752,94 +2747,108 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill ; AVX512-NEXT: addq $-1, %r10 ; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movl $0, %eax ; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: movq %rax, %rcx +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %rcx +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: addq $-1, %r8 +; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: addq $-1, %rsi ; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: movl $0, %r12d -; AVX512-NEXT: adcq $-1, %r12 -; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movl $0, %ebx -; AVX512-NEXT: adcq $-1, %rbx -; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill ; AVX512-NEXT: movl $0, %r13d ; AVX512-NEXT: adcq $-1, %r13 -; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: addq $-1, %r11 +; AVX512-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movl $0, %r15d ; AVX512-NEXT: adcq $-1, %r15 ; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movl $0, %r11d -; AVX512-NEXT: adcq $-1, %r11 +; AVX512-NEXT: movl $0, %eax +; AVX512-NEXT: adcq $-1, %rax +; AVX512-NEXT: movq %rax, %rsi ; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT: movl $0, %r8d -; AVX512-NEXT: adcq $-1, %r8 +; AVX512-NEXT: movl $0, %r12d +; AVX512-NEXT: adcq $-1, %r12 +; AVX512-NEXT: addq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill +; AVX512-NEXT: movl $0, %ebx +; AVX512-NEXT: adcq $-1, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload ; AVX512-NEXT: addq $-1, %rbp +; AVX512-NEXT: movl $0, %r11d +; AVX512-NEXT: adcq $-1, %r11 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: addq $-1, %rax ; AVX512-NEXT: movl $0, %r10d ; AVX512-NEXT: adcq $-1, %r10 ; AVX512-NEXT: addq $-1, %r14 +; AVX512-NEXT: movl $0, %r8d +; AVX512-NEXT: adcq $-1, %r8 +; AVX512-NEXT: addq $-1, %r9 ; AVX512-NEXT: movl $0, %edi ; AVX512-NEXT: adcq $-1, %rdi -; AVX512-NEXT: addq $-1, %r9 -; AVX512-NEXT: movl $0, %esi -; AVX512-NEXT: adcq $-1, %rsi ; AVX512-NEXT: addq $-1, %rdx -; AVX512-NEXT: movl $0, %eax -; AVX512-NEXT: adcq $-1, %rax -; AVX512-NEXT: shldq $63, %rdx, %rax -; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT: shldq $63, %r9, %rsi -; AVX512-NEXT: shldq $63, %r14, %rdi -; AVX512-NEXT: shldq $63, %rbp, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: shldq $63, %rdx, %r8 +; AVX512-NEXT: movl $0, %ecx +; AVX512-NEXT: adcq $-1, %rcx +; AVX512-NEXT: shldq $63, %rdx, %rcx +; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 
+; AVX512-NEXT: shldq $63, %r9, %rdi +; AVX512-NEXT: shldq $63, %r14, %r8 +; AVX512-NEXT: shldq $63, %rax, %r10 +; AVX512-NEXT: shldq $63, %rbp, %r11 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: shldq $63, %rdx, %r11 +; AVX512-NEXT: shldq $63, %rdx, %rbx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: shldq $63, %rdx, %r15 +; AVX512-NEXT: shldq $63, %rdx, %r12 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT: shldq $63, %rdx, %r13 +; AVX512-NEXT: shldq $63, %rdx, %rsi +; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %rbx +; AVX512-NEXT: shldq $63, %rax, %r15 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %r12 +; AVX512-NEXT: shldq $63, %rax, %r13 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: shldq $63, %rax, %rcx -; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX512-NEXT: shldq $63, %rax, %rsi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: shldq $63, %rax, %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: shldq $63, %rdx, %rax -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512-NEXT: movq (%rsp), %r14 # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: shldq $63, %rdx, %r14 -; AVX512-NEXT: movq (%rsp), %r9 # 8-byte Reload +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: shldq $63, %rdx, %r9 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT: shldq $63, %rbp, %rdx -; AVX512-NEXT: vmovq %rdx, %xmm8 +; AVX512-NEXT: shldq $63, %rdx, %rbp +; AVX512-NEXT: vmovq %rbp, %xmm8 ; AVX512-NEXT: vmovq %r9, %xmm9 ; AVX512-NEXT: vmovq %r14, %xmm10 ; AVX512-NEXT: vmovq %rax, %xmm11 ; AVX512-NEXT: vmovq %rcx, %xmm12 -; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 8-byte Folded Reload -; AVX512-NEXT: # xmm13 = mem[0],zero -; AVX512-NEXT: vmovq %r12, %xmm14 -; AVX512-NEXT: vmovq %rbx, %xmm15 -; AVX512-NEXT: vmovq %r13, %xmm0 -; AVX512-NEXT: vmovq %r15, %xmm1 -; AVX512-NEXT: vmovq %r11, %xmm2 -; AVX512-NEXT: vmovq %r8, %xmm3 +; AVX512-NEXT: vmovq %rsi, %xmm13 +; AVX512-NEXT: vmovq %r13, %xmm14 +; AVX512-NEXT: vmovq %r15, %xmm15 +; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Folded Reload +; AVX512-NEXT: # xmm0 = mem[0],zero +; AVX512-NEXT: vmovq %r12, %xmm1 +; AVX512-NEXT: vmovq %rbx, %xmm2 +; AVX512-NEXT: vmovq %r11, %xmm3 ; AVX512-NEXT: vmovq %r10, %xmm4 -; AVX512-NEXT: vmovq %rdi, %xmm5 -; AVX512-NEXT: vmovq %rsi, %xmm6 +; AVX512-NEXT: vmovq %r8, %xmm5 +; AVX512-NEXT: vmovq %rdi, %xmm6 ; AVX512-NEXT: vmovq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 8-byte Folded Reload ; AVX512-NEXT: # xmm7 = mem[0],zero ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm8 = xmm9[0],xmm8[0] @@ -2860,7 +2869,7 @@ define void @not_avg_v16i8_wide_constants(<16 x i8>* %a, <16 x i8>* %b) nounwind ; AVX512-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512-NEXT: vinserti64x4 
$1, %ymm0, %zmm2, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, (%rax) -; AVX512-NEXT: addq $16, %rsp +; AVX512-NEXT: addq $24, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 ; AVX512-NEXT: popq %r13 diff --git a/llvm/test/CodeGen/X86/crash-O0.ll b/llvm/test/CodeGen/X86/crash-O0.ll index deaf19daccc..1a234d45cb2 100644 --- a/llvm/test/CodeGen/X86/crash-O0.ll +++ b/llvm/test/CodeGen/X86/crash-O0.ll @@ -77,11 +77,11 @@ define i64 @addressModeWith32bitIndex(i32 %V) { ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: movq %rcx, %rax ; CHECK-NEXT: cqto -; CHECK-NEXT: movslq %edi, %rcx -; CHECK-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi ## 8-byte Reload -; CHECK-NEXT: idivq (%rsi,%rcx,8) +; CHECK-NEXT: movslq %edi, %rsi +; CHECK-NEXT: idivq (%rcx,%rsi,8) ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: retq %gep = getelementptr i64, i64* null, i32 %V diff --git a/llvm/test/CodeGen/X86/hoist-spill.ll b/llvm/test/CodeGen/X86/hoist-spill.ll index 040924a6c28..6a3f5ca01e8 100644 --- a/llvm/test/CodeGen/X86/hoist-spill.ll +++ b/llvm/test/CodeGen/X86/hoist-spill.ll @@ -2,7 +2,9 @@ ; Check no spills to the same stack slot after hoisting. ; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp) +; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp) ; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp) +; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp) target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/X86/machine-cse.ll b/llvm/test/CodeGen/X86/machine-cse.ll index 8ce61be555f..b55b43fafa5 100644 --- a/llvm/test/CodeGen/X86/machine-cse.ll +++ b/llvm/test/CodeGen/X86/machine-cse.ll @@ -133,26 +133,25 @@ return: define i8* @bsd_memchr(i8* %s, i32 %a, i32 %c, i64 %n) nounwind ssp { ; CHECK-LABEL: bsd_memchr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testq %rcx, %rcx -; CHECK-NEXT: je .LBB3_5 +; CHECK-NEXT: je .LBB3_4 ; CHECK-NEXT: # %bb.1: # %preheader +; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB3_2: # %do.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: cmpl %edx, %esi -; CHECK-NEXT: je .LBB3_3 -; CHECK-NEXT: # %bb.4: # %do.cond +; CHECK-NEXT: je .LBB3_5 +; CHECK-NEXT: # %bb.3: # %do.cond ; CHECK-NEXT: # in Loop: Header=BB3_2 Depth=1 -; CHECK-NEXT: incq %rdi +; CHECK-NEXT: incq %rax ; CHECK-NEXT: decq %rcx ; CHECK-NEXT: jne .LBB3_2 +; CHECK-NEXT: .LBB3_4: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .LBB3_5: # %return ; CHECK-NEXT: retq -; CHECK-NEXT: .LBB3_3: -; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: retq entry: %cmp = icmp eq i64 %n, 0 br i1 %cmp, label %return, label %preheader diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll index 92e5424253f..4cb6daeec1a 100644 --- a/llvm/test/CodeGen/X86/madd.ll +++ b/llvm/test/CodeGen/X86/madd.ll @@ -356,7 +356,7 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body @@ -365,18 +365,18 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; SSE2-NEXT: movdqu 16(%rdi,%rcx,2), %xmm6 ; 
SSE2-NEXT: movdqu 32(%rdi,%rcx,2), %xmm7 ; SSE2-NEXT: movdqu 48(%rdi,%rcx,2), %xmm9 -; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: pmaddwd %xmm5, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: pmaddwd %xmm6, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: pmaddwd %xmm7, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm1 -; SSE2-NEXT: pmaddwd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm0 +; SSE2-NEXT: pmaddwd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rsi,%rcx,2), %xmm0 +; SSE2-NEXT: pmaddwd %xmm6, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm4 +; SSE2-NEXT: movdqu 32(%rsi,%rcx,2), %xmm0 +; SSE2-NEXT: pmaddwd %xmm7, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqu 48(%rsi,%rcx,2), %xmm0 +; SSE2-NEXT: pmaddwd %xmm9, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm3 ; SSE2-NEXT: addq $16, %rcx ; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: jne .LBB3_1 @@ -385,14 +385,14 @@ define i32 @_Z10test_shortPsS_i_1024(i16* nocapture readonly, i16* nocapture rea ; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm8, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: _Z10test_shortPsS_i_1024: @@ -949,7 +949,7 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: xorl %ecx, %ecx ; SSE2-NEXT: pxor %xmm9, %xmm9 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB7_1: # %vector.body @@ -963,9 +963,9 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm0 ; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 @@ -980,11 +980,11 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 ; SSE2-NEXT: pmaddwd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psraw $8, %xmm2 -; SSE2-NEXT: pmaddwd %xmm1, %xmm2 +; SSE2-NEXT: pmaddwd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm2, %xmm3 ; SSE2-NEXT: addq $32, %rcx ; SSE2-NEXT: 
cmpq %rcx, %rax @@ -994,14 +994,14 @@ define i32 @_Z9test_charPcS_i_1024(i8* nocapture readonly, i8* nocapture readonl ; SSE2-NEXT: paddd %xmm8, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm8, %xmm9 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm9, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm9, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: _Z9test_charPcS_i_1024: diff --git a/llvm/test/CodeGen/X86/mmx-arith.ll b/llvm/test/CodeGen/X86/mmx-arith.ll index 4362a193014..2d24cb8df35 100644 --- a/llvm/test/CodeGen/X86/mmx-arith.ll +++ b/llvm/test/CodeGen/X86/mmx-arith.ll @@ -604,13 +604,12 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind { ; ; X64-LABEL: test3: ; X64: # %bb.0: # %entry +; X64-NEXT: xorl %r8d, %r8d ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: testl %edx, %edx -; X64-NEXT: je .LBB3_3 -; X64-NEXT: # %bb.1: # %bb26.preheader -; X64-NEXT: xorl %r8d, %r8d +; X64-NEXT: je .LBB3_2 ; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB3_2: # %bb26 +; X64-NEXT: .LBB3_1: # %bb26 ; X64-NEXT: # =>This Inner Loop Header: Depth=1 ; X64-NEXT: movslq %r8d, %r8 ; X64-NEXT: movq (%rdi,%r8,8), %rcx @@ -618,8 +617,8 @@ define <1 x i64> @test3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind { ; X64-NEXT: addq %rcx, %rax ; X64-NEXT: incl %r8d ; X64-NEXT: cmpl %edx, %r8d -; X64-NEXT: jb .LBB3_2 -; X64-NEXT: .LBB3_3: # %bb31 +; X64-NEXT: jb .LBB3_1 +; X64-NEXT: .LBB3_2: # %bb31 ; X64-NEXT: retq entry: %tmp2942 = icmp eq i32 %count, 0 diff --git a/llvm/test/CodeGen/X86/pr32284.ll b/llvm/test/CodeGen/X86/pr32284.ll index 878c1c5af61..ab6680cf45a 100644 --- a/llvm/test/CodeGen/X86/pr32284.ll +++ b/llvm/test/CodeGen/X86/pr32284.ll @@ -10,12 +10,13 @@ define void @foo() { ; X86-O0-LABEL: foo: ; X86-O0: # %bb.0: # %entry ; X86-O0-NEXT: xorl %eax, %eax -; X86-O0-NEXT: xorl %ecx, %ecx +; X86-O0-NEXT: movl %eax, %ecx +; X86-O0-NEXT: xorl %eax, %eax ; X86-O0-NEXT: movzbl c, %edx -; X86-O0-NEXT: subl %edx, %ecx -; X86-O0-NEXT: movslq %ecx, %rsi -; X86-O0-NEXT: subq %rsi, %rax -; X86-O0-NEXT: movb %al, %dil +; X86-O0-NEXT: subl %edx, %eax +; X86-O0-NEXT: movslq %eax, %rsi +; X86-O0-NEXT: subq %rsi, %rcx +; X86-O0-NEXT: movb %cl, %dil ; X86-O0-NEXT: cmpb $0, %dil ; X86-O0-NEXT: setne %dil ; X86-O0-NEXT: andb $1, %dil @@ -25,13 +26,13 @@ define void @foo() { ; X86-O0-NEXT: xorb $-1, %dil ; X86-O0-NEXT: xorb $-1, %dil ; X86-O0-NEXT: andb $1, %dil -; X86-O0-NEXT: movzbl %dil, %ecx +; X86-O0-NEXT: movzbl %dil, %eax ; X86-O0-NEXT: movzbl c, %edx -; X86-O0-NEXT: cmpl %edx, %ecx +; X86-O0-NEXT: cmpl %edx, %eax ; X86-O0-NEXT: setle %dil ; X86-O0-NEXT: andb $1, %dil -; X86-O0-NEXT: movzbl %dil, %ecx -; X86-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X86-O0-NEXT: movzbl %dil, %eax +; X86-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) ; X86-O0-NEXT: retq ; ; X64-LABEL: foo: diff --git a/llvm/test/CodeGen/X86/pr32340.ll b/llvm/test/CodeGen/X86/pr32340.ll index 559bd8d6b5a..b530bb18c93 100644 --- a/llvm/test/CodeGen/X86/pr32340.ll +++ b/llvm/test/CodeGen/X86/pr32340.ll @@ -14,21 +14,22 @@ define void @foo() { ; X64-LABEL: foo: ; X64: # %bb.0: # %entry ; X64-NEXT: 
xorl %eax, %eax +; X64-NEXT: movl %eax, %ecx ; X64-NEXT: movw $0, var_825 -; X64-NEXT: movzwl var_32, %ecx +; X64-NEXT: movzwl var_32, %eax ; X64-NEXT: movzwl var_901, %edx -; X64-NEXT: movl %ecx, %esi +; X64-NEXT: movl %eax, %esi ; X64-NEXT: xorl %edx, %esi -; X64-NEXT: movl %ecx, %edx +; X64-NEXT: movl %eax, %edx ; X64-NEXT: xorl %esi, %edx -; X64-NEXT: addl %ecx, %edx +; X64-NEXT: addl %eax, %edx ; X64-NEXT: movslq %edx, %rdi ; X64-NEXT: movq %rdi, var_826 -; X64-NEXT: movzwl var_32, %ecx -; X64-NEXT: movl %ecx, %edi -; X64-NEXT: movzwl var_901, %ecx -; X64-NEXT: xorl $51981, %ecx # imm = 0xCB0D -; X64-NEXT: movslq %ecx, %r8 +; X64-NEXT: movzwl var_32, %eax +; X64-NEXT: movl %eax, %edi +; X64-NEXT: movzwl var_901, %eax +; X64-NEXT: xorl $51981, %eax # imm = 0xCB0D +; X64-NEXT: movslq %eax, %r8 ; X64-NEXT: movabsq $-1142377792914660288, %r9 # imm = 0xF02575732E06E440 ; X64-NEXT: xorq %r9, %r8 ; X64-NEXT: movq %rdi, %r9 @@ -40,11 +41,11 @@ define void @foo() { ; X64-NEXT: orq %r8, %rdi ; X64-NEXT: movw %di, %r10w ; X64-NEXT: movw %r10w, var_900 -; X64-NEXT: cmpq var_28, %rax +; X64-NEXT: cmpq var_28, %rcx ; X64-NEXT: setne %r11b ; X64-NEXT: andb $1, %r11b -; X64-NEXT: movzbl %r11b, %ecx -; X64-NEXT: movw %cx, %r10w +; X64-NEXT: movzbl %r11b, %eax +; X64-NEXT: movw %ax, %r10w ; X64-NEXT: movw %r10w, var_827 ; X64-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/scheduler-backtracking.ll b/llvm/test/CodeGen/X86/scheduler-backtracking.ll index e558fed7436..811bd9bd031 100644 --- a/llvm/test/CodeGen/X86/scheduler-backtracking.ll +++ b/llvm/test/CodeGen/X86/scheduler-backtracking.ll @@ -20,18 +20,18 @@ define i256 @test1(i256 %a) nounwind { ; ILP-NEXT: pushq %rbx ; ILP-NEXT: movq %rcx, %r9 ; ILP-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; ILP-NEXT: xorl %eax, %eax ; ILP-NEXT: addq $1, %rsi ; ILP-NEXT: adcq $0, %rdx ; ILP-NEXT: adcq $0, %r9 ; ILP-NEXT: adcq $0, %r8 ; ILP-NEXT: leal 1(%rsi,%rsi), %edi ; ILP-NEXT: movl $1, %ebp -; ILP-NEXT: xorl %eax, %eax -; ILP-NEXT: xorl %r11d, %r11d +; ILP-NEXT: xorl %r14d, %r14d ; ILP-NEXT: movl %edi, %ecx -; ILP-NEXT: shldq %cl, %rbp, %r11 -; ILP-NEXT: movl $1, %r14d -; ILP-NEXT: shlq %cl, %r14 +; ILP-NEXT: shldq %cl, %rbp, %r14 +; ILP-NEXT: movl $1, %r11d +; ILP-NEXT: shlq %cl, %r11 ; ILP-NEXT: movb $-128, %r10b ; ILP-NEXT: subb %dil, %r10b ; ILP-NEXT: movq %r9, %r13 @@ -42,33 +42,33 @@ define i256 @test1(i256 %a) nounwind { ; ILP-NEXT: xorl %r15d, %r15d ; ILP-NEXT: movl %edi, %ecx ; ILP-NEXT: shldq %cl, %r15, %r15 -; ILP-NEXT: movq %rsi, %rbp -; ILP-NEXT: shrdq %cl, %rdx, %rbp +; ILP-NEXT: movq %rsi, %rbx +; ILP-NEXT: shrdq %cl, %rdx, %rbx ; ILP-NEXT: shrq %cl, %rdx ; ILP-NEXT: addb $-128, %cl ; ILP-NEXT: shrdq %cl, %r8, %r9 ; ILP-NEXT: testb $64, %dil -; ILP-NEXT: cmovneq %r14, %r11 -; ILP-NEXT: cmoveq %rbp, %rdx +; ILP-NEXT: cmovneq %r11, %r14 +; ILP-NEXT: cmoveq %rbx, %rdx ; ILP-NEXT: cmovneq %rax, %r15 -; ILP-NEXT: cmovneq %rax, %r14 +; ILP-NEXT: cmovneq %rax, %r11 ; ILP-NEXT: testb $64, %r10b ; ILP-NEXT: cmovneq %rax, %r12 ; ILP-NEXT: cmovneq %rax, %r13 -; ILP-NEXT: movl $1, %ebp -; ILP-NEXT: shlq %cl, %rbp +; ILP-NEXT: movl $1, %ebx +; ILP-NEXT: shlq %cl, %rbx ; ILP-NEXT: orl %edx, %r13d ; ILP-NEXT: xorl %edx, %edx -; ILP-NEXT: movl $1, %ebx -; ILP-NEXT: shldq %cl, %rbx, %rdx +; ILP-NEXT: movl $1, %ebp +; ILP-NEXT: shldq %cl, %rbp, %rdx ; ILP-NEXT: shrq %cl, %r8 ; ILP-NEXT: testb $64, %cl ; ILP-NEXT: cmoveq %r9, %r8 -; ILP-NEXT: cmovneq %rbp, %rdx -; ILP-NEXT: cmovneq %rax, %rbp +; ILP-NEXT: cmovneq %rbx, %rdx +; ILP-NEXT: 
cmovneq %rax, %rbx ; ILP-NEXT: testb %dil, %dil -; ILP-NEXT: cmovsq %rax, %r11 ; ILP-NEXT: cmovsq %rax, %r14 +; ILP-NEXT: cmovsq %rax, %r11 ; ILP-NEXT: jns .LBB0_2 ; ILP-NEXT: # %bb.1: ; ILP-NEXT: movl %r8d, %r13d @@ -77,20 +77,20 @@ define i256 @test1(i256 %a) nounwind { ; ILP-NEXT: # %bb.3: ; ILP-NEXT: movl %r13d, %esi ; ILP-NEXT: .LBB0_4: -; ILP-NEXT: cmovnsq %r12, %rbp -; ILP-NEXT: cmoveq %rax, %rbp +; ILP-NEXT: cmovnsq %r12, %rbx +; ILP-NEXT: cmoveq %rax, %rbx ; ILP-NEXT: cmovnsq %r15, %rdx ; ILP-NEXT: cmoveq %rax, %rdx ; ILP-NEXT: testb $1, %sil ; ILP-NEXT: cmovneq %rax, %rdx ; ILP-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; ILP-NEXT: movq %rdx, 24(%rax) -; ILP-NEXT: cmovneq %rax, %rbp -; ILP-NEXT: movq %rbp, 16(%rax) -; ILP-NEXT: cmovneq %rax, %r11 -; ILP-NEXT: movq %r11, 8(%rax) +; ILP-NEXT: cmovneq %rax, %rbx +; ILP-NEXT: movq %rbx, 16(%rax) ; ILP-NEXT: cmovneq %rax, %r14 -; ILP-NEXT: movq %r14, (%rax) +; ILP-NEXT: movq %r14, 8(%rax) +; ILP-NEXT: cmovneq %rax, %r11 +; ILP-NEXT: movq %r11, (%rax) ; ILP-NEXT: popq %rbx ; ILP-NEXT: popq %r12 ; ILP-NEXT: popq %r13 @@ -101,6 +101,7 @@ define i256 @test1(i256 %a) nounwind { ; ; HYBRID-LABEL: test1: ; HYBRID: # %bb.0: +; HYBRID-NEXT: pushq %rbp ; HYBRID-NEXT: pushq %r15 ; HYBRID-NEXT: pushq %r14 ; HYBRID-NEXT: pushq %r13 @@ -112,82 +113,84 @@ define i256 @test1(i256 %a) nounwind { ; HYBRID-NEXT: adcq $0, %rdx ; HYBRID-NEXT: adcq $0, %r9 ; HYBRID-NEXT: adcq $0, %r8 +; HYBRID-NEXT: xorl %r10d, %r10d ; HYBRID-NEXT: leal 1(%rsi,%rsi), %edi ; HYBRID-NEXT: xorl %r14d, %r14d -; HYBRID-NEXT: xorl %r15d, %r15d ; HYBRID-NEXT: movl %edi, %ecx -; HYBRID-NEXT: shldq %cl, %r15, %r15 +; HYBRID-NEXT: shldq %cl, %r14, %r14 ; HYBRID-NEXT: testb $64, %dil -; HYBRID-NEXT: cmovneq %r14, %r15 -; HYBRID-NEXT: movl $1, %r11d +; HYBRID-NEXT: cmovneq %r10, %r14 +; HYBRID-NEXT: movl $1, %ebp ; HYBRID-NEXT: movl $1, %r12d ; HYBRID-NEXT: shlq %cl, %r12 ; HYBRID-NEXT: testb $64, %dil -; HYBRID-NEXT: movq %r12, %r10 -; HYBRID-NEXT: cmovneq %r14, %r10 +; HYBRID-NEXT: movq %r12, %r11 +; HYBRID-NEXT: cmovneq %r10, %r11 ; HYBRID-NEXT: movq %rsi, %rbx ; HYBRID-NEXT: shrdq %cl, %rdx, %rbx ; HYBRID-NEXT: shrq %cl, %rdx ; HYBRID-NEXT: testb $64, %dil ; HYBRID-NEXT: cmoveq %rbx, %rdx -; HYBRID-NEXT: xorl %r13d, %r13d -; HYBRID-NEXT: shldq %cl, %r11, %r13 +; HYBRID-NEXT: xorl %r15d, %r15d +; HYBRID-NEXT: shldq %cl, %rbp, %r15 ; HYBRID-NEXT: testb $64, %dil -; HYBRID-NEXT: cmovneq %r12, %r13 +; HYBRID-NEXT: cmovneq %r12, %r15 ; HYBRID-NEXT: movb $-128, %cl ; HYBRID-NEXT: subb %dil, %cl -; HYBRID-NEXT: movq %r9, %rbx -; HYBRID-NEXT: shlq %cl, %rbx +; HYBRID-NEXT: movq %r9, %r13 +; HYBRID-NEXT: shlq %cl, %r13 ; HYBRID-NEXT: movl $1, %r12d -; HYBRID-NEXT: shrdq %cl, %r14, %r12 +; HYBRID-NEXT: shrdq %cl, %r10, %r12 ; HYBRID-NEXT: testb $64, %cl -; HYBRID-NEXT: cmovneq %r14, %r12 -; HYBRID-NEXT: cmovneq %r14, %rbx -; HYBRID-NEXT: orl %edx, %ebx +; HYBRID-NEXT: cmovneq %r10, %r12 +; HYBRID-NEXT: cmovneq %r10, %r13 +; HYBRID-NEXT: orl %edx, %r13d ; HYBRID-NEXT: movl %edi, %ecx ; HYBRID-NEXT: addb $-128, %cl ; HYBRID-NEXT: shrdq %cl, %r8, %r9 ; HYBRID-NEXT: shrq %cl, %r8 ; HYBRID-NEXT: xorl %edx, %edx -; HYBRID-NEXT: shldq %cl, %r11, %rdx -; HYBRID-NEXT: shlq %cl, %r11 +; HYBRID-NEXT: shldq %cl, %rbp, %rdx +; HYBRID-NEXT: shlq %cl, %rbp ; HYBRID-NEXT: testb $64, %cl -; HYBRID-NEXT: cmovneq %r11, %rdx +; HYBRID-NEXT: cmovneq %rbp, %rdx ; HYBRID-NEXT: cmoveq %r9, %r8 -; HYBRID-NEXT: cmovneq %r14, %r11 +; HYBRID-NEXT: cmovneq %r10, %rbp ; HYBRID-NEXT: testb %dil, %dil ; 
HYBRID-NEXT: jns .LBB0_2 ; HYBRID-NEXT: # %bb.1: -; HYBRID-NEXT: movl %r8d, %ebx +; HYBRID-NEXT: movl %r8d, %r13d ; HYBRID-NEXT: .LBB0_2: ; HYBRID-NEXT: je .LBB0_4 ; HYBRID-NEXT: # %bb.3: -; HYBRID-NEXT: movl %ebx, %esi +; HYBRID-NEXT: movl %r13d, %esi ; HYBRID-NEXT: .LBB0_4: -; HYBRID-NEXT: cmovsq %r14, %r13 -; HYBRID-NEXT: cmovnsq %r12, %r11 -; HYBRID-NEXT: cmoveq %r14, %r11 -; HYBRID-NEXT: cmovnsq %r15, %rdx -; HYBRID-NEXT: cmoveq %r14, %rdx -; HYBRID-NEXT: cmovsq %r14, %r10 +; HYBRID-NEXT: cmovsq %r10, %r15 +; HYBRID-NEXT: cmovnsq %r12, %rbp +; HYBRID-NEXT: cmoveq %r10, %rbp +; HYBRID-NEXT: cmovnsq %r14, %rdx +; HYBRID-NEXT: cmoveq %r10, %rdx +; HYBRID-NEXT: cmovsq %r10, %r11 ; HYBRID-NEXT: testb $1, %sil ; HYBRID-NEXT: cmovneq %rax, %rdx ; HYBRID-NEXT: movq %rdx, 24(%rax) +; HYBRID-NEXT: cmovneq %rax, %rbp +; HYBRID-NEXT: movq %rbp, 16(%rax) +; HYBRID-NEXT: cmovneq %rax, %r15 +; HYBRID-NEXT: movq %r15, 8(%rax) ; HYBRID-NEXT: cmovneq %rax, %r11 -; HYBRID-NEXT: movq %r11, 16(%rax) -; HYBRID-NEXT: cmovneq %rax, %r13 -; HYBRID-NEXT: movq %r13, 8(%rax) -; HYBRID-NEXT: cmovneq %rax, %r10 -; HYBRID-NEXT: movq %r10, (%rax) +; HYBRID-NEXT: movq %r11, (%rax) ; HYBRID-NEXT: popq %rbx ; HYBRID-NEXT: popq %r12 ; HYBRID-NEXT: popq %r13 ; HYBRID-NEXT: popq %r14 ; HYBRID-NEXT: popq %r15 +; HYBRID-NEXT: popq %rbp ; HYBRID-NEXT: retq ; ; BURR-LABEL: test1: ; BURR: # %bb.0: +; BURR-NEXT: pushq %rbp ; BURR-NEXT: pushq %r15 ; BURR-NEXT: pushq %r14 ; BURR-NEXT: pushq %r13 @@ -199,78 +202,79 @@ define i256 @test1(i256 %a) nounwind { ; BURR-NEXT: adcq $0, %rdx ; BURR-NEXT: adcq $0, %r9 ; BURR-NEXT: adcq $0, %r8 +; BURR-NEXT: xorl %r10d, %r10d ; BURR-NEXT: leal 1(%rsi,%rsi), %edi ; BURR-NEXT: xorl %r14d, %r14d -; BURR-NEXT: xorl %r15d, %r15d ; BURR-NEXT: movl %edi, %ecx -; BURR-NEXT: shldq %cl, %r15, %r15 +; BURR-NEXT: shldq %cl, %r14, %r14 ; BURR-NEXT: testb $64, %dil -; BURR-NEXT: cmovneq %r14, %r15 -; BURR-NEXT: movl $1, %r11d +; BURR-NEXT: cmovneq %r10, %r14 +; BURR-NEXT: movl $1, %ebp ; BURR-NEXT: movl $1, %r12d ; BURR-NEXT: shlq %cl, %r12 ; BURR-NEXT: testb $64, %dil -; BURR-NEXT: movq %r12, %r10 -; BURR-NEXT: cmovneq %r14, %r10 +; BURR-NEXT: movq %r12, %r11 +; BURR-NEXT: cmovneq %r10, %r11 ; BURR-NEXT: movq %rsi, %rbx ; BURR-NEXT: shrdq %cl, %rdx, %rbx ; BURR-NEXT: shrq %cl, %rdx ; BURR-NEXT: testb $64, %dil ; BURR-NEXT: cmoveq %rbx, %rdx -; BURR-NEXT: xorl %r13d, %r13d -; BURR-NEXT: shldq %cl, %r11, %r13 +; BURR-NEXT: xorl %r15d, %r15d +; BURR-NEXT: shldq %cl, %rbp, %r15 ; BURR-NEXT: testb $64, %dil -; BURR-NEXT: cmovneq %r12, %r13 +; BURR-NEXT: cmovneq %r12, %r15 ; BURR-NEXT: movb $-128, %cl ; BURR-NEXT: subb %dil, %cl -; BURR-NEXT: movq %r9, %rbx -; BURR-NEXT: shlq %cl, %rbx +; BURR-NEXT: movq %r9, %r13 +; BURR-NEXT: shlq %cl, %r13 ; BURR-NEXT: movl $1, %r12d -; BURR-NEXT: shrdq %cl, %r14, %r12 +; BURR-NEXT: shrdq %cl, %r10, %r12 ; BURR-NEXT: testb $64, %cl -; BURR-NEXT: cmovneq %r14, %r12 -; BURR-NEXT: cmovneq %r14, %rbx -; BURR-NEXT: orl %edx, %ebx +; BURR-NEXT: cmovneq %r10, %r12 +; BURR-NEXT: cmovneq %r10, %r13 +; BURR-NEXT: orl %edx, %r13d ; BURR-NEXT: movl %edi, %ecx ; BURR-NEXT: addb $-128, %cl ; BURR-NEXT: shrdq %cl, %r8, %r9 ; BURR-NEXT: xorl %edx, %edx -; BURR-NEXT: shldq %cl, %r11, %rdx +; BURR-NEXT: shldq %cl, %rbp, %rdx ; BURR-NEXT: shrq %cl, %r8 -; BURR-NEXT: shlq %cl, %r11 +; BURR-NEXT: shlq %cl, %rbp ; BURR-NEXT: testb $64, %cl -; BURR-NEXT: cmovneq %r11, %rdx +; BURR-NEXT: cmovneq %rbp, %rdx ; BURR-NEXT: cmoveq %r9, %r8 -; BURR-NEXT: cmovneq %r14, %r11 +; BURR-NEXT: cmovneq %r10, 
%rbp ; BURR-NEXT: testb %dil, %dil ; BURR-NEXT: jns .LBB0_2 ; BURR-NEXT: # %bb.1: -; BURR-NEXT: movl %r8d, %ebx +; BURR-NEXT: movl %r8d, %r13d ; BURR-NEXT: .LBB0_2: ; BURR-NEXT: je .LBB0_4 ; BURR-NEXT: # %bb.3: -; BURR-NEXT: movl %ebx, %esi +; BURR-NEXT: movl %r13d, %esi ; BURR-NEXT: .LBB0_4: -; BURR-NEXT: cmovsq %r14, %r13 -; BURR-NEXT: cmovnsq %r12, %r11 -; BURR-NEXT: cmoveq %r14, %r11 -; BURR-NEXT: cmovnsq %r15, %rdx -; BURR-NEXT: cmoveq %r14, %rdx -; BURR-NEXT: cmovsq %r14, %r10 +; BURR-NEXT: cmovsq %r10, %r15 +; BURR-NEXT: cmovnsq %r12, %rbp +; BURR-NEXT: cmoveq %r10, %rbp +; BURR-NEXT: cmovnsq %r14, %rdx +; BURR-NEXT: cmoveq %r10, %rdx +; BURR-NEXT: cmovsq %r10, %r11 ; BURR-NEXT: testb $1, %sil ; BURR-NEXT: cmovneq %rax, %rdx ; BURR-NEXT: movq %rdx, 24(%rax) +; BURR-NEXT: cmovneq %rax, %rbp +; BURR-NEXT: movq %rbp, 16(%rax) +; BURR-NEXT: cmovneq %rax, %r15 +; BURR-NEXT: movq %r15, 8(%rax) ; BURR-NEXT: cmovneq %rax, %r11 -; BURR-NEXT: movq %r11, 16(%rax) -; BURR-NEXT: cmovneq %rax, %r13 -; BURR-NEXT: movq %r13, 8(%rax) -; BURR-NEXT: cmovneq %rax, %r10 -; BURR-NEXT: movq %r10, (%rax) +; BURR-NEXT: movq %r11, (%rax) ; BURR-NEXT: popq %rbx ; BURR-NEXT: popq %r12 ; BURR-NEXT: popq %r13 ; BURR-NEXT: popq %r14 ; BURR-NEXT: popq %r15 +; BURR-NEXT: popq %rbp ; BURR-NEXT: retq ; ; SRC-LABEL: test1: @@ -297,8 +301,8 @@ define i256 @test1(i256 %a) nounwind { ; SRC-NEXT: movl %r11d, %ecx ; SRC-NEXT: shrdq %cl, %rdx, %rbp ; SRC-NEXT: shrq %cl, %rdx -; SRC-NEXT: movl $1, %edi ; SRC-NEXT: xorl %r15d, %r15d +; SRC-NEXT: movl $1, %edi ; SRC-NEXT: xorl %r14d, %r14d ; SRC-NEXT: shldq %cl, %rdi, %r14 ; SRC-NEXT: xorl %r13d, %r13d @@ -906,15 +910,15 @@ define i64 @test4(i64 %a, i64 %b) nounwind { ; ILP-LABEL: test4: ; ILP: # %bb.0: ; ILP-NEXT: xorl %ecx, %ecx +; ILP-NEXT: xorl %edx, %edx ; ILP-NEXT: addq $1, %rsi -; ILP-NEXT: setb %cl +; ILP-NEXT: setb %dl ; ILP-NEXT: movl $2, %eax -; ILP-NEXT: xorl %edx, %edx ; ILP-NEXT: cmpq %rdi, %rsi -; ILP-NEXT: sbbq $0, %rcx -; ILP-NEXT: movl $0, %ecx -; ILP-NEXT: sbbq $0, %rcx ; ILP-NEXT: sbbq $0, %rdx +; ILP-NEXT: movl $0, %edx +; ILP-NEXT: sbbq $0, %rdx +; ILP-NEXT: sbbq $0, %rcx ; ILP-NEXT: setae %cl ; ILP-NEXT: movzbl %cl, %ecx ; ILP-NEXT: subq %rcx, %rax @@ -923,14 +927,14 @@ define i64 @test4(i64 %a, i64 %b) nounwind { ; HYBRID-LABEL: test4: ; HYBRID: # %bb.0: ; HYBRID-NEXT: xorl %eax, %eax -; HYBRID-NEXT: addq $1, %rsi -; HYBRID-NEXT: setb %al ; HYBRID-NEXT: xorl %ecx, %ecx +; HYBRID-NEXT: addq $1, %rsi +; HYBRID-NEXT: setb %cl ; HYBRID-NEXT: cmpq %rdi, %rsi -; HYBRID-NEXT: sbbq $0, %rax -; HYBRID-NEXT: movl $0, %eax -; HYBRID-NEXT: sbbq $0, %rax ; HYBRID-NEXT: sbbq $0, %rcx +; HYBRID-NEXT: movl $0, %ecx +; HYBRID-NEXT: sbbq $0, %rcx +; HYBRID-NEXT: sbbq $0, %rax ; HYBRID-NEXT: setae %al ; HYBRID-NEXT: movzbl %al, %ecx ; HYBRID-NEXT: movl $2, %eax @@ -940,14 +944,14 @@ define i64 @test4(i64 %a, i64 %b) nounwind { ; BURR-LABEL: test4: ; BURR: # %bb.0: ; BURR-NEXT: xorl %eax, %eax -; BURR-NEXT: addq $1, %rsi -; BURR-NEXT: setb %al ; BURR-NEXT: xorl %ecx, %ecx +; BURR-NEXT: addq $1, %rsi +; BURR-NEXT: setb %cl ; BURR-NEXT: cmpq %rdi, %rsi -; BURR-NEXT: sbbq $0, %rax -; BURR-NEXT: movl $0, %eax -; BURR-NEXT: sbbq $0, %rax ; BURR-NEXT: sbbq $0, %rcx +; BURR-NEXT: movl $0, %ecx +; BURR-NEXT: sbbq $0, %rcx +; BURR-NEXT: sbbq $0, %rax ; BURR-NEXT: setae %al ; BURR-NEXT: movzbl %al, %ecx ; BURR-NEXT: movl $2, %eax diff --git a/llvm/test/CodeGen/X86/spill-zero-x86_64.ll b/llvm/test/CodeGen/X86/spill-zero-x86_64.ll deleted file mode 100644 index 
d90cca6eabd..00000000000 --- a/llvm/test/CodeGen/X86/spill-zero-x86_64.ll +++ /dev/null @@ -1,75 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s - -; This test checks that we use "movq $0, (%rsp)" to spill a 0 to the stack. It -; was reduced from a larger function. - -; CHECK: movq $0, (%rsp) # 8-byte Folded Spill - -%struct.foo = type { i8*, i32 } - -declare void @pluto() - -define void @spam() { -bb: - br label %bb13 - -bb1: ; preds = %bb18 - call void @pluto() - %tmp = getelementptr inbounds %struct.foo, %struct.foo* %tmp20, i64 0, i32 1 - %tmp2 = bitcast i32* %tmp to %struct.foo** - store %struct.foo* null, %struct.foo** %tmp2 - unreachable - -bb3: ; preds = %bb18 - call void @pluto() - store i8* %tmp22, i8** undef - unreachable - -bb4: ; preds = %bb18 - call void @pluto() - br label %bb13 - -bb5: ; preds = %bb18 - %tmp7 = add nsw i32 %tmp23, 1 - store i8* %tmp22, i8** undef - unreachable - -bb8: ; preds = %bb18 - store %struct.foo* %tmp14, %struct.foo** undef - unreachable - -bb9: ; preds = %bb18 - %tmp10 = load %struct.foo*, %struct.foo** undef - br label %bb13 - -bb13: ; preds = %bb18, %bb9, %bb4, %bb - %tmp14 = phi %struct.foo* [ %tmp14, %bb18 ], [ %tmp14, %bb4 ], [ null, %bb ], [ %tmp10, %bb9 ] - %tmp15 = phi %struct.foo* [ %tmp26, %bb18 ], [ %tmp26, %bb4 ], [ null, %bb ], [ %tmp26, %bb9 ] - %tmp16 = phi i32 [ %tmp23, %bb18 ], [ %tmp23, %bb4 ], [ 0, %bb ], [ %tmp23, %bb9 ] - br label %bb17 - -bb17: ; preds = %bb13 - br i1 false, label %bb27, label %bb18 - -bb18: ; preds = %bb17 - %tmp19 = load %struct.foo*, %struct.foo** undef - %tmp20 = getelementptr inbounds %struct.foo, %struct.foo* %tmp19, i64 0 - %tmp21 = getelementptr inbounds %struct.foo, %struct.foo* %tmp20, i64 0, i32 0 - %tmp22 = load i8*, i8** %tmp21 - %tmp23 = add nsw i32 %tmp16, -1 - %tmp24 = getelementptr inbounds %struct.foo, %struct.foo* %tmp15, i64 0, i32 1 - %tmp25 = bitcast i32* %tmp24 to %struct.foo** - %tmp26 = load %struct.foo*, %struct.foo** %tmp25 - switch i32 undef, label %bb9 [ - i32 1, label %bb1 - i32 2, label %bb3 - i32 3, label %bb4 - i32 4, label %bb5 - i32 5, label %bb13 - i32 6, label %bb8 - ] - -bb27: ; preds = %bb17 - ret void -} diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll index a88a714f016..cb0597f7151 100644 --- a/llvm/test/CodeGen/X86/swifterror.ll +++ b/llvm/test/CodeGen/X86/swifterror.ll @@ -41,7 +41,8 @@ define float @caller(i8* %error_ref) { ; CHECK-APPLE: callq {{.*}}free ; CHECK-O0-LABEL: caller: -; CHECK-O0: xorl %r12d, %r12d +; CHECK-O0: xorl +; CHECK-O0: movl %{{.*}}, %r12d ; CHECK-O0: callq {{.*}}foo ; CHECK-O0: jne entry: @@ -77,7 +78,8 @@ define float @caller2(i8* %error_ref) { ; CHECK-APPLE: callq {{.*}}free ; CHECK-O0-LABEL: caller2: -; CHECK-O0: xorl %r12d, %r12d +; CHECK-O0: xorl +; CHECK-O0: movl %{{.*}}, %r12d ; CHECK-O0: callq {{.*}}foo ; CHECK-O0: movq %r12, [[ID:%[a-z]+]] ; CHECK-O0: cmpq $0, %r12 @@ -252,7 +254,8 @@ define float @caller3(i8* %error_ref) { ; CHECK-APPLE: callq {{.*}}free ; CHECK-O0-LABEL: caller3: -; CHECK-O0: xorl %r12d, %r12d +; CHECK-O0: xorl +; CHECK-O0: movl {{.*}}, %r12d ; CHECK-O0: movl $1, %esi ; CHECK-O0: movq {{.*}}, %rdi ; CHECK-O0: callq {{.*}}foo_sret @@ -310,12 +313,14 @@ define float @caller_with_multiple_swifterror_values(i8* %error_ref, i8* %error_ ; CHECK-O0-LABEL: caller_with_multiple_swifterror_values: ; The first swifterror value: -; CHECK-O0: xorl %r12d, %r12d +; CHECK-O0: xorl +; CHECK-O0: 
movl %{{.*}}, %r12d ; CHECK-O0: callq {{.*}}foo ; CHECK-O0: jne ; The second swifterror value: -; CHECK-O0: xorl %r12d, %r12d +; CHECK-O0: xorl +; CHECK-O0: movl %{{.*}}, %r12d ; CHECK-O0: callq {{.*}}foo ; CHECK-O0: jne entry: @@ -710,7 +715,8 @@ declare swiftcc void @foo2(%swift_error** swifterror) ; Make sure we properly assign registers during fast-isel. ; CHECK-O0-LABEL: testAssign ; CHECK-O0: pushq %r12 -; CHECK-O0: xorl %r12d, %r12d +; CHECK-O0: xorl [[ZERO:%[a-z0-9]+]], [[ZERO]] +; CHECK-O0: movl [[ZERO]], %r12d ; CHECK-O0: callq _foo2 ; CHECK-O0: movq %r12, [[SLOT:[-a-z0-9\(\)\%]*]] ; @@ -786,7 +792,8 @@ a: ; CHECK-O0-LABEL: testAssign4 ; CHECK-O0: callq _foo2 -; CHECK-O0: xorl %eax, %eax +; CHECK-O0: xorl %ecx, %ecx +; CHECK-O0: movl %ecx, %eax ; CHECK-O0: movq %rax, [[SLOT:[-a-z0-9\(\)\%]*]] ; CHECK-O0: movq [[SLOT]], %rax ; CHECK-O0: movq %rax, [[SLOT2:[-a-z0-9\(\)\%]*]] |
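
The loosened CHECK-O0 patterns above no longer pin the zero to a single instruction: they accept any 32-bit xor followed by a register-to-register copy into %r12d (or %eax), which is still a full 64-bit zero because on x86-64 a write to a 32-bit register implicitly clears bits 63:32 of the containing 64-bit register. Below is a minimal sketch, in the same FileCheck style, of how such a zero-materialization pattern can be tested; the function name is hypothetical, it is not part of this change, and the check lines are deliberately loose because the exact registers picked by the -O0 fast register allocator may differ between revisions.

; RUN: llc < %s -mtriple=x86_64-unknown-unknown -O0 | FileCheck %s
; Materializing an i64 zero at -O0: a 32-bit xor is enough, since the implicit
; zero-extension of 32-bit writes clears the upper half of the 64-bit register
; before it is copied into the return register. Only the xor and the return are
; checked; the intermediate copy and register choice are left unconstrained.
define i64 @return_zero() {
; CHECK-LABEL: return_zero:
; CHECK: xorl
; CHECK: retq
entry:
  ret i64 0
}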