-rw-r--r--  llvm/include/llvm/CodeGen/LiveRegMatrix.h              |   7
-rw-r--r--  llvm/lib/CodeGen/LiveRegMatrix.cpp                     |  16
-rw-r--r--  llvm/lib/CodeGen/RegAllocGreedy.cpp                    |  82
-rw-r--r--  llvm/test/CodeGen/X86/bug26810.ll                      |   3
-rw-r--r--  llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll  |  89
-rw-r--r--  llvm/test/CodeGen/X86/sad.ll                           | 239
6 files changed, 312 insertions, 124 deletions
diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
index fa6827f6b1f..f62a55c7308 100644
--- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -107,6 +107,13 @@ public:
/// with the highest enum value is returned.
InterferenceKind checkInterference(LiveInterval &VirtReg, unsigned PhysReg);
+ /// Check for interference in the segment [Start, End) that may prevent
+ /// assignment to PhysReg. If this function returns true, there is
+ /// interference in the segment [Start, End) of some other interval already
+ /// assigned to PhysReg. If this function returns false, PhysReg is free over
+ /// the segment [Start, End).
+ bool checkInterference(SlotIndex Start, SlotIndex End, unsigned PhysReg);
+
/// Assign VirtReg to PhysReg.
/// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
/// update VirtRegMap. The live range is expected to be available in PhysReg.
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index bd435968296..d8faf75466c 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -205,3 +205,19 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
return IK_Free;
}
+
+bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
+ unsigned PhysReg) {
+ // Construct artificial live range containing only one segment [Start, End).
+ VNInfo valno(0, Start);
+ LiveRange::Segment Seg(Start, End, &valno);
+ LiveRange LR;
+ LR.addSegment(Seg);
+
+ // Check for interference with that segment
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ if (query(LR, *Units).checkInterference())
+ return true;
+ }
+ return false;
+}
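
To make the new API concrete, here is a minimal usage sketch that is not part of the patch: findFreeRegForSegment is a hypothetical helper, and Matrix/Order are assumed to be the allocator's usual LiveRegMatrix and AllocationOrder objects. It relies only on the overload added above and on AllocationOrder::getOrder(), which the patch itself iterates the same way below.

    // Hypothetical helper: probe each register in allocation order and return
    // the first one whose register units are free over [Start, End).
    unsigned findFreeRegForSegment(LiveRegMatrix &Matrix,
                                   const AllocationOrder &Order,
                                   SlotIndex Start, SlotIndex End) {
      for (auto PhysReg : Order.getOrder())
        // The new overload reports whether any interval already assigned to
        // PhysReg overlaps the segment [Start, End).
        if (!Matrix.checkInterference(Start, End, PhysReg))
          return PhysReg; // Free over the whole segment.
      return 0;           // No candidate register is free for this segment.
    }
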
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index e4801c48efd..80349457783 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -448,6 +448,9 @@ private:
bool splitCanCauseEvictionChain(unsigned Evictee, GlobalSplitCandidate &Cand,
unsigned BBNumber,
const AllocationOrder &Order);
+ bool splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+ GlobalSplitCandidate &Cand, unsigned BBNumber,
+ const AllocationOrder &Order);
BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate &,
const AllocationOrder &Order,
bool *CanCauseEvictionChain);
@@ -1427,7 +1430,7 @@ BlockFrequency RAGreedy::calcSpillCost() {
/// we are splitting for and the interferences.
/// \param BBNumber The number of a BB for which the region split process will
/// create a local split interval.
-/// \param Order The phisical registers that may get evicted by a split
+/// \param Order The physical registers that may get evicted by a split
/// artifact of Evictee.
/// \return True if splitting Evictee may cause a bad eviction chain, false
/// otherwise.
@@ -1448,8 +1451,8 @@ bool RAGreedy::splitCanCauseEvictionChain(unsigned Evictee,
getCheapestEvicteeWeight(Order, LIS->getInterval(Evictee),
Cand.Intf.first(), Cand.Intf.last(), &MaxWeight);
- // The bad eviction chain occurs when either the split candidate the
- // evited reg or one of the split artifact will evict the evicting reg.
+ // The bad eviction chain occurs when either the split candidate is the
+ // evicting reg or one of the split artifacts will evict the evicting reg.
if ((PhysReg != Cand.PhysReg) && (PhysReg != FutureEvictedPhysReg))
return false;
@@ -1479,6 +1482,54 @@ bool RAGreedy::splitCanCauseEvictionChain(unsigned Evictee,
return true;
}
+/// \brief Check if splitting VirtRegToSplit will create a local split interval
+/// in basic block number BBNumber that may cause a spill.
+///
+/// \param VirtRegToSplit The register considered to be split.
+/// \param Cand The split candidate that determines the physical
+/// register we are splitting for and the interferences.
+/// \param BBNumber The number of a BB for which the region split process
+/// will create a local split interval.
+/// \param Order The physical registers that may get evicted by a
+/// split artifact of VirtRegToSplit.
+/// \return True if splitting VirtRegToSplit may cause a spill, false
+/// otherwise.
+bool RAGreedy::splitCanCauseLocalSpill(unsigned VirtRegToSplit,
+ GlobalSplitCandidate &Cand,
+ unsigned BBNumber,
+ const AllocationOrder &Order) {
+ Cand.Intf.moveToBlock(BBNumber);
+
+ // Check if the local interval will find a non-interfering assignment.
+ for (auto PhysReg : Order.getOrder()) {
+ if (!Matrix->checkInterference(Cand.Intf.first().getPrevIndex(),
+ Cand.Intf.last(), PhysReg))
+ return false;
+ }
+
+ // Check if the local interval will evict a cheaper interval.
+ float CheapestEvictWeight = 0;
+ unsigned FutureEvictedPhysReg = getCheapestEvicteeWeight(
+ Order, LIS->getInterval(VirtRegToSplit), Cand.Intf.first(),
+ Cand.Intf.last(), &CheapestEvictWeight);
+
+ // Have we found an interval that can be evicted?
+ if (FutureEvictedPhysReg) {
+ VirtRegAuxInfo VRAI(*MF, *LIS, VRM, getAnalysis<MachineLoopInfo>(), *MBFI);
+ float splitArtifactWeight =
+ VRAI.futureWeight(LIS->getInterval(VirtRegToSplit),
+ Cand.Intf.first().getPrevIndex(), Cand.Intf.last());
+ // Will the weight of the local interval be higher than the cheapest evictee
+ // weight? If so, it will evict it and will not cause a spill.
+ if (splitArtifactWeight >= 0 && splitArtifactWeight > CheapestEvictWeight)
+ return false;
+ }
+
+ // The local interval is not able to find a non-interfering assignment and is
+ // not able to evict a less worthy interval; therefore, it can cause a spill.
+ return true;
+}
+
/// calcGlobalSplitCost - Return the global split cost of following the split
/// pattern in LiveBundles. This cost should be added to the local cost of the
/// interference pattern in SplitConstraints.
@@ -1499,19 +1550,26 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
Cand.Intf.moveToBlock(BC.Number);
// Check wheather a local interval is going to be created during the region
- // split.
- if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
- Cand.Intf.hasInterference() && BI.LiveIn && BI.LiveOut && RegIn &&
- RegOut) {
-
- if (splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
- // This interfernce cause our eviction from this assignment, we might
- // evict somebody else, add that cost.
+ // split. Calculate the advanced split cost (the cost of local intervals) if
+ // this option is enabled.
+ if (EnableAdvancedRASplitCost && Cand.Intf.hasInterference() && BI.LiveIn &&
+ BI.LiveOut && RegIn && RegOut) {
+
+ if (CanCauseEvictionChain &&
+ splitCanCauseEvictionChain(VirtRegToSplit, Cand, BC.Number, Order)) {
+ // This interference causes our eviction from this assignment; we might
+ // evict somebody else and eventually someone will spill, so add that cost.
// See splitCanCauseEvictionChain for detailed description of scenarios.
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
*CanCauseEvictionChain = true;
+
+ } else if (splitCanCauseLocalSpill(VirtRegToSplit, Cand, BC.Number,
+ Order)) {
+ // This interference causes the local interval to spill, so add that cost.
+ GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
+ GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
}
}
@@ -1540,7 +1598,7 @@ BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand,
// region split.
if (EnableAdvancedRASplitCost && CanCauseEvictionChain &&
splitCanCauseEvictionChain(VirtRegToSplit, Cand, Number, Order)) {
- // This interfernce cause our eviction from this assignment, we might
+ // This interference causes our eviction from this assignment, we might
// evict somebody else, add that cost.
// See splitCanCauseEvictionChain for detailed description of
// scenarios.
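
As a rough, standalone illustration of the cost model above (this is not LLVM code): when the region split would create a local interval in a block that either starts a bad eviction chain or, per splitCanCauseLocalSpill, cannot avoid spilling, the patch charges that block's frequency twice. BlockFreq below stands in for LLVM's BlockFrequency, and the two flags represent the results of the two checks for a single block.

    #include <cstdint>

    using BlockFreq = uint64_t; // Stand-in for llvm::BlockFrequency.

    // Extra global split cost contributed by one block under the advanced
    // split-cost heuristic: both bad cases add the block frequency twice.
    BlockFreq extraSplitCostForBlock(bool CausesEvictionChain,
                                     bool CausesLocalSpill, BlockFreq Freq) {
      if (CausesEvictionChain || CausesLocalSpill)
        return 2 * Freq;
      return 0; // The local interval is not expected to add spill code here.
    }
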
diff --git a/llvm/test/CodeGen/X86/bug26810.ll b/llvm/test/CodeGen/X86/bug26810.ll
index 263008131e7..fbb8730026e 100644
--- a/llvm/test/CodeGen/X86/bug26810.ll
+++ b/llvm/test/CodeGen/X86/bug26810.ll
@@ -22,9 +22,10 @@
; CHECK: bb.2.for.body:
; CHECK: SUBPDrr
; CHECK-NEXT: MOVAPSmr
-; CHECK-NEXT: MOVAPSrm
; CHECK-NEXT: MULPDrm
+; CHECK-NEXT: MOVAPSrm
; CHECK-NEXT: ADDPDrr
+; CHECK-NEXT: MOVAPSmr
; CHECK-NEXT: ADD32ri8
target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
diff --git a/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
new file mode 100644
index 00000000000..4ebc9d81e22
--- /dev/null
+++ b/llvm/test/CodeGen/X86/regalloc-advanced-split-cost.ll
@@ -0,0 +1,89 @@
+; RUN: llc < %s -march=x86 -regalloc=greedy --debug-only=regalloc 2>&1 | FileCheck %s
+
+; This test is meant to make sure that the weight of local intervals that are
+; created during split is taken into account when choosing the best candidate
+; register.
+; %shl is the interval that will be split.
+; The inline assembly calls interfere with %shl and leave only 2 available
+; split candidates: %esi and %ebp.
+; The old code would have chosen %esi as the split candidate ignoring the fact
+; that this choice will cause the creation of a local interval that will have a
+; certain spill cost.
+; The new code chooses %ebp as the split candidate as it has the lower spill cost.
+
+; Make sure the split behaves as expected
+; CHECK: RS_Split Cascade 1
+; CHECK-NOT: %eax static =
+; CHECK: %eax no positive bundles
+; CHECK-NEXT: %ecx no positive bundles
+; CHECK-NEXT: %edx no positive bundles
+; CHECK-NEXT: %esi static =
+; CHECK-NEXT: %edi no positive bundles
+; CHECK-NEXT: %ebx no positive bundles
+; CHECK-NEXT: %ebp static =
+; CHECK: Split for %ebp
+
+; Function Attrs: nounwind
+define i32 @foo(i32* %array, i32 %cond1, i32 %val) local_unnamed_addr #0 {
+entry:
+ %array.addr = alloca i32*, align 4
+ store i32* %array, i32** %array.addr, align 4, !tbaa !3
+ %0 = load i32, i32* %array, align 4, !tbaa !7
+ %arrayidx1 = getelementptr inbounds i32, i32* %array, i32 1
+ %1 = load i32, i32* %arrayidx1, align 4, !tbaa !7
+ %arrayidx2 = getelementptr inbounds i32, i32* %array, i32 2
+ %2 = load i32, i32* %arrayidx2, align 4, !tbaa !7
+ %arrayidx3 = getelementptr inbounds i32, i32* %array, i32 3
+ %3 = load i32, i32* %arrayidx3, align 4, !tbaa !7
+ %arrayidx4 = getelementptr inbounds i32, i32* %array, i32 4
+ %4 = load i32, i32* %arrayidx4, align 4, !tbaa !7
+ %arrayidx6 = getelementptr inbounds i32, i32* %array, i32 %val
+ %5 = load i32, i32* %arrayidx6, align 4, !tbaa !7
+ %shl = shl i32 %5, 5
+ %tobool = icmp eq i32 %cond1, 0
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %arrayidx7 = getelementptr inbounds i32, i32* %array, i32 6
+ store i32 %shl, i32* %arrayidx7, align 4, !tbaa !7
+ call void asm "nop", "=*m,r,r,r,r,r,*m,~{dirflag},~{fpsr},~{flags}"(i32** nonnull %array.addr, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32** nonnull %array.addr) #1, !srcloc !9
+ %6 = load i32*, i32** %array.addr, align 4, !tbaa !3
+ %arrayidx8 = getelementptr inbounds i32, i32* %6, i32 7
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx5 = getelementptr inbounds i32, i32* %array, i32 5
+ %7 = load i32, i32* %arrayidx5, align 4, !tbaa !7
+ %arrayidx9 = getelementptr inbounds i32, i32* %array, i32 8
+ store i32 %shl, i32* %arrayidx9, align 4, !tbaa !7
+ call void asm "nop", "=*m,{ax},{bx},{cx},{dx},{di},{si},{ebp},*m,~{dirflag},~{fpsr},~{flags}"(i32** nonnull %array.addr, i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %7, i32* undef, i32** nonnull %array.addr) #1, !srcloc !10
+ %8 = load i32*, i32** %array.addr, align 4, !tbaa !3
+ %arrayidx10 = getelementptr inbounds i32, i32* %8, i32 9
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %arrayidx10.sink = phi i32* [ %arrayidx10, %if.else ], [ %arrayidx8, %if.then ]
+ %9 = phi i32* [ %8, %if.else ], [ %6, %if.then ]
+ store i32 %shl, i32* %arrayidx10.sink, align 4, !tbaa !7
+ %10 = load i32, i32* %9, align 4, !tbaa !7
+ %add = add nsw i32 %10, %shl
+ ret i32 %add
+}
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-features"="+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{!"clang version 6.0.0"}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!8, !8, i64 0}
+!8 = !{!"int", !5, i64 0}
+!9 = !{i32 268}
+!10 = !{i32 390}
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 3524c4aab1d..382eba3d652 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -148,21 +148,21 @@ define i32 @sad_32i8() nounwind {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: pxor %xmm12, %xmm12
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm6, %xmm6
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm14, %xmm14
+; SSE2-NEXT: pxor %xmm13, %xmm13
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: pxor %xmm15, %xmm15
-; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB1_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1040(%rax), %xmm8
; SSE2-NEXT: movdqa a+1024(%rax), %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm4
@@ -216,61 +216,65 @@ define i32 @sad_32i8() nounwind {
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm7
; SSE2-NEXT: pxor %xmm6, %xmm7
-; SSE2-NEXT: paddd %xmm7, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm7, %xmm6
+; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm6
; SSE2-NEXT: psrad $31, %xmm6
; SSE2-NEXT: paddd %xmm6, %xmm4
; SSE2-NEXT: pxor %xmm6, %xmm4
; SSE2-NEXT: movdqa %xmm10, %xmm6
-; SSE2-NEXT: paddd %xmm4, %xmm6
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm4, %xmm7
+; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm1, %xmm4
; SSE2-NEXT: psrad $31, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: pxor %xmm4, %xmm1
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: paddd %xmm1, %xmm6
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm3
; SSE2-NEXT: pxor %xmm1, %xmm3
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: paddd %xmm5, %xmm14
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm5, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm2, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm2
; SSE2-NEXT: pxor %xmm0, %xmm2
-; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm2, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm8, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm8
; SSE2-NEXT: pxor %xmm0, %xmm8
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm8, %xmm0
+; SSE2-NEXT: paddd %xmm8, %xmm14
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB1_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd %xmm15, %xmm6
-; SSE2-NEXT: paddd %xmm0, %xmm3
-; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm15, %xmm0
; SSE2-NEXT: paddd %xmm14, %xmm13
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm3, %xmm4
-; SSE2-NEXT: paddd %xmm13, %xmm4
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1]
-; SSE2-NEXT: paddd %xmm4, %xmm0
+; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm13, %xmm6
+; SSE2-NEXT: paddd %xmm0, %xmm6
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1]
+; SSE2-NEXT: paddd %xmm6, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: movd %xmm1, %eax
@@ -400,43 +404,43 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: subq $200, %rsp
; SSE2-NEXT: pxor %xmm14, %xmm14
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
-; SSE2-NEXT: pxor %xmm15, %xmm15
-; SSE2-NEXT: pxor %xmm10, %xmm10
-; SSE2-NEXT: pxor %xmm3, %xmm3
-; SSE2-NEXT: pxor %xmm5, %xmm5
-; SSE2-NEXT: pxor %xmm13, %xmm13
-; SSE2-NEXT: pxor %xmm1, %xmm1
-; SSE2-NEXT: pxor %xmm8, %xmm8
; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: pxor %xmm11, %xmm11
-; SSE2-NEXT: pxor %xmm4, %xmm4
-; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: pxor %xmm7, %xmm7
-; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB2_1: # %vector.body
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movaps a+1040(%rax), %xmm0
-; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa a+1024(%rax), %xmm12
; SSE2-NEXT: movdqa a+1056(%rax), %xmm15
; SSE2-NEXT: movdqa a+1072(%rax), %xmm4
@@ -497,7 +501,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7]
; SSE2-NEXT: psubd %xmm3, %xmm2
-; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT: psubd %xmm0, %xmm15
; SSE2-NEXT: movdqa %xmm7, %xmm0
@@ -506,7 +510,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3]
; SSE2-NEXT: psubd %xmm3, %xmm9
; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm2, %xmm9
; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7]
@@ -539,7 +543,7 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7]
; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7]
; SSE2-NEXT: psubd %xmm13, %xmm2
-; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa %xmm2, %xmm13
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: psrad $31, %xmm3
; SSE2-NEXT: paddd %xmm3, %xmm1
@@ -554,15 +558,13 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm6, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm5, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm5
; SSE2-NEXT: pxor %xmm1, %xmm5
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm5, %xmm1
-; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm4, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm4
@@ -570,7 +572,6 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm4, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm8, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm8
@@ -578,7 +579,6 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm8, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
; SSE2-NEXT: movdqa %xmm11, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm11
@@ -587,55 +587,64 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: paddd %xmm11, %xmm1
; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm3
-; SSE2-NEXT: movdqa %xmm3, %xmm11
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm15, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm15
; SSE2-NEXT: pxor %xmm1, %xmm15
-; SSE2-NEXT: paddd %xmm15, %xmm2
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm4, %xmm1
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm15, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm4
-; SSE2-NEXT: pxor %xmm1, %xmm4
-; SSE2-NEXT: paddd %xmm4, %xmm6
-; SSE2-NEXT: movdqa %xmm6, %xmm15
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm10, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm10
; SSE2-NEXT: pxor %xmm1, %xmm10
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm10
-; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm6, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
-; SSE2-NEXT: paddd %xmm1, %xmm6
-; SSE2-NEXT: pxor %xmm1, %xmm6
-; SSE2-NEXT: paddd %xmm6, %xmm3
+; SSE2-NEXT: paddd %xmm1, %xmm2
+; SSE2-NEXT: pxor %xmm1, %xmm2
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm12, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm12
; SSE2-NEXT: pxor %xmm1, %xmm12
-; SSE2-NEXT: paddd %xmm12, %xmm5
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm12, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm0, %xmm1
; SSE2-NEXT: psrad $31, %xmm1
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pxor %xmm1, %xmm0
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm0, %xmm13
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm9, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm9
; SSE2-NEXT: pxor %xmm0, %xmm9
-; SSE2-NEXT: paddd %xmm9, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd %xmm9, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: movdqa %xmm7, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
; SSE2-NEXT: paddd %xmm0, %xmm7
@@ -643,32 +652,40 @@ define i32 @sad_avx64i8() nounwind {
; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd %xmm7, %xmm0
; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
-; SSE2-NEXT: movdqa %xmm7, %xmm0
+; SSE2-NEXT: movdqa %xmm13, %xmm1
+; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: psrad $31, %xmm0
-; SSE2-NEXT: paddd %xmm0, %xmm7
-; SSE2-NEXT: pxor %xmm0, %xmm7
+; SSE2-NEXT: paddd %xmm0, %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm1
; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT: paddd %xmm7, %xmm0
+; SSE2-NEXT: paddd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
; SSE2-NEXT: addq $4, %rax
; SSE2-NEXT: jne .LBB2_1
; SSE2-NEXT: # %bb.2: # %middle.block
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm3, %xmm8
-; SSE2-NEXT: paddd %xmm2, %xmm15
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm8, %xmm13
-; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
-; SSE2-NEXT: paddd %xmm5, %xmm0
-; SSE2-NEXT: paddd %xmm11, %xmm10
+; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
; SSE2-NEXT: paddd %xmm0, %xmm1
-; SSE2-NEXT: paddd %xmm10, %xmm1
-; SSE2-NEXT: paddd %xmm13, %xmm1
-; SSE2-NEXT: paddd %xmm15, %xmm1
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm1, %xmm3
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm1, %xmm4
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT: paddd %xmm4, %xmm1
+; SSE2-NEXT: paddd %xmm2, %xmm1
+; SSE2-NEXT: paddd %xmm3, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE2-NEXT: paddd %xmm1, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]