author    Simon Pilgrim <llvm-dev@redking.me.uk>  2016-08-22 12:56:54 +0000
committer Simon Pilgrim <llvm-dev@redking.me.uk>  2016-08-22 12:56:54 +0000
commit    2279e595737524c6133fddad91634a28c12cd07c (patch)
tree      a37d3d4c092eb008896bf2ab08b04a3a7cb3f579
parent    f0ed16eae58479155639e601bbfbff961f2cc49a (diff)
[X86][SSE] Avoid specifying unused arguments in SHUFPD lowering
As discussed on PR26491, we are missing the opportunity to make use of the smaller MOVHLPS instruction because we set both arguments of a SHUFPD when using it to lower a single input shuffle.

This patch sets the lowered argument to UNDEF if that shuffle element is undefined. This in turn makes it easier for target shuffle combining to decode UNDEF shuffle elements, allowing combines to MOVHLPS to occur. A fix to match against MOVHPD stores was necessary as well.

This builds on the improved MOVLHPS/MOVHLPS lowering and memory folding support added in D16956.

Adding similar support for SHUFPS will have to wait until we have better support for target combining of binary shuffles.

Differential Revision: https://reviews.llvm.org/D23027

llvm-svn: 279430
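For illustration only (this sketch is not part of the commit), a minimal LLVM IR example of the kind of single-input shuffle the patch affects; the function name is hypothetical, and the instruction mapping is taken from the test diffs below:

; A single-input v2f64 shuffle whose second mask element is undef.
; Before this patch it lowered to "shufpd $1, %xmm0, %xmm0" (xmm0[1,0]);
; with one SHUFPD operand set to UNDEF, target shuffle combining can
; instead select the smaller "movhlps %xmm0, %xmm0" (xmm0[1,1]).
define <2 x double> @hypothetical_high_elt(<2 x double> %v) {
  %s = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  ret <2 x double> %s
}

Compiling such a function with llc and -mattr=+sse2 should exercise the changed lowering path.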
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp            |   7
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td                 |  12
-rw-r--r--  llvm/test/CodeGen/X86/haddsub-2.ll                 |  16
-rw-r--r--  llvm/test/CodeGen/X86/haddsub-undef.ll             |   6
-rw-r--r--  llvm/test/CodeGen/X86/nontemporal-2.ll             |   2
-rw-r--r--  llvm/test/CodeGen/X86/pr11334.ll                   |  14
-rw-r--r--  llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll |   4
-rw-r--r--  llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll         |  14
-rw-r--r--  llvm/test/CodeGen/X86/sse_partial_update.ll        |   4
-rw-r--r--  llvm/test/CodeGen/X86/vec_extract.ll               |   8
-rw-r--r--  llvm/test/CodeGen/X86/vec_fp_to_int.ll             | 310
-rw-r--r--  llvm/test/CodeGen/X86/vector-rem.ll                |   8
-rw-r--r--  llvm/test/CodeGen/X86/widen_conv-3.ll              |   4
-rw-r--r--  llvm/test/CodeGen/X86/widen_conv-4.ll              |   8
14 files changed, 216 insertions(+), 201 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 89f139765b0..ddc95dce734 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9004,8 +9004,11 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
- return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V1,
- DAG.getConstant(SHUFPDMask, DL, MVT::i8));
+ return DAG.getNode(
+ X86ISD::SHUFP, DL, MVT::v2f64,
+ Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+ Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
+ DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
assert(Mask[1] >= 2 && "Non-canonicalized blend!");
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 72879ea44af..a215a80d3ce 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -1300,6 +1300,7 @@ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
+
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1307,6 +1308,11 @@ let Predicates = [UseAVX] in {
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (VMOVHPDmr addr:$dst, VR128:$src)>;
+
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(VMOVHPDmr addr:$dst, VR128:$src)>;
@@ -1332,6 +1338,7 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
+
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1339,6 +1346,11 @@ let Predicates = [UseSSE2] in {
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
+ (iPTR 0))), addr:$dst),
+ (MOVHPDmr addr:$dst, VR128:$src)>;
+
+ def : Pat<(store (f64 (extractelt
(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
(iPTR 0))), addr:$dst),
(MOVHPDmr addr:$dst, VR128:$src)>;
diff --git a/llvm/test/CodeGen/X86/haddsub-2.ll b/llvm/test/CodeGen/X86/haddsub-2.ll
index 517a663bc81..b670b925762 100644
--- a/llvm/test/CodeGen/X86/haddsub-2.ll
+++ b/llvm/test/CodeGen/X86/haddsub-2.ll
@@ -907,9 +907,9 @@ define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE: # BB#0:
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm3
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT: subss %xmm3, %xmm2
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
@@ -917,7 +917,7 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
; SSE-NEXT: movaps %xmm1, %xmm4
-; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: subss %xmm4, %xmm3
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
@@ -964,11 +964,11 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE: # BB#0:
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: movaps %xmm1, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: subsd %xmm2, %xmm1
-; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: movaps %xmm0, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: subsd %xmm0, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movapd %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 5e2e50893d0..6d79d4de520 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -102,8 +102,8 @@ define <4 x float> @test4_undef(<4 x float> %a, <4 x float> %b) {
define <2 x double> @test5_undef(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: test5_undef:
; SSE: # BB#0:
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movaps %xmm0, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: addsd %xmm0, %xmm1
; SSE-NEXT: movapd %xmm1, %xmm0
; SSE-NEXT: retq
@@ -168,7 +168,7 @@ define <4 x float> @test8_undef(<4 x float> %a, <4 x float> %b) {
; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm0, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: addss %xmm2, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/nontemporal-2.ll b/llvm/test/CodeGen/X86/nontemporal-2.ll
index accc1f8bbea..42030484851 100644
--- a/llvm/test/CodeGen/X86/nontemporal-2.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-2.ll
@@ -563,7 +563,7 @@ define void @test_extract_f64(<2 x double> %arg, double* %dst) {
;
; SSE4A-LABEL: test_extract_f64:
; SSE4A: # BB#0:
-; SSE4A-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE4A-NEXT: movntsd %xmm0, (%rdi)
; SSE4A-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll
index 1f878645607..eff814575db 100644
--- a/llvm/test/CodeGen/X86/pr11334.ll
+++ b/llvm/test/CodeGen/X86/pr11334.ll
@@ -21,13 +21,13 @@ define <3 x double> @v3f2d_ext_vec(<3 x float> %v1) nounwind {
; SSE-LABEL: v3f2d_ext_vec:
; SSE: # BB#0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvtps2pd %xmm0, %xmm0
; SSE-NEXT: movlps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movapd %xmm2, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movaps %xmm2, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: fldl -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movapd %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: v3f2d_ext_vec:
@@ -43,7 +43,7 @@ define <4 x double> @v4f2d_ext_vec(<4 x float> %v1) nounwind {
; SSE-LABEL: v4f2d_ext_vec:
; SSE: # BB#0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvtps2pd %xmm0, %xmm1
; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: retq
@@ -62,9 +62,9 @@ define <8 x double> @v8f2d_ext_vec(<8 x float> %v1) nounwind {
; SSE: # BB#0: # %entry
; SSE-NEXT: cvtps2pd %xmm0, %xmm5
; SSE-NEXT: cvtps2pd %xmm1, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT: cvtps2pd %xmm0, %xmm4
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: cvtps2pd %xmm1, %xmm3
; SSE-NEXT: movaps %xmm5, %xmm0
; SSE-NEXT: movaps %xmm4, %xmm1
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index d3ebba93c76..9d1ab922d96 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -3233,13 +3233,13 @@ define void @test_mm_storeh_sd(double *%a0, <2 x double> %a1) {
; X32-LABEL: test_mm_storeh_sd:
; X32: # BB#0:
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-NEXT: movsd %xmm0, (%eax)
; X32-NEXT: retl
;
; X64-LABEL: test_mm_storeh_sd:
; X64: # BB#0:
-; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-NEXT: movsd %xmm0, (%rdi)
; X64-NEXT: retq
%ext = extractelement <2 x double> %a1, i32 1
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index 3f47d987aed..4d895ea264c 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -267,8 +267,8 @@ define <4 x float> @test10(<4 x float> %A, <4 x float> %B) {
define <4 x float> @test11(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: test11:
; SSE: # BB#0:
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE-NEXT: retq
@@ -339,8 +339,8 @@ define <4 x float> @test14(<4 x float> %A, <4 x float> %B) {
; SSE: # BB#0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm1, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT: subss %xmm1, %xmm0
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,1,3]
@@ -408,9 +408,9 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: subss %xmm0, %xmm2
; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1,0]
-; SSE-NEXT: movapd %xmm1, %xmm4
-; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1,0]
+; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
; SSE-NEXT: subss %xmm4, %xmm3
; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; SSE-NEXT: addss %xmm0, %xmm4
diff --git a/llvm/test/CodeGen/X86/sse_partial_update.ll b/llvm/test/CodeGen/X86/sse_partial_update.ll
index 51359d1790a..bd207c99dbd 100644
--- a/llvm/test/CodeGen/X86/sse_partial_update.ll
+++ b/llvm/test/CodeGen/X86/sse_partial_update.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -mcpu=nehalem | FileCheck %s
; rdar: 12558838
@@ -77,7 +77,7 @@ define void @sqrtsd(<2 x double> %a) nounwind uwtable ssp {
; CHECK: ## BB#0: ## %entry
; CHECK-NEXT: sqrtsd %xmm0, %xmm0
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm2
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1
; CHECK-NEXT: movaps %xmm2, %xmm0
; CHECK-NEXT: jmp _callee2 ## TAILCALL
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 47f719d9e32..58d8392b235 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -33,7 +33,7 @@ define float @test2(<4 x float>* %F, float* %f) nounwind {
; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-NEXT: movaps (%eax), %xmm0
; X32-NEXT: addps %xmm0, %xmm0
-; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-NEXT: movss %xmm0, (%esp)
; X32-NEXT: flds (%esp)
; X32-NEXT: popl %eax
@@ -43,7 +43,7 @@ define float @test2(<4 x float>* %F, float* %f) nounwind {
; X64: # BB#0: # %entry
; X64-NEXT: movaps (%rdi), %xmm0
; X64-NEXT: addps %xmm0, %xmm0
-; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-NEXT: retq
entry:
%tmp = load <4 x float>, <4 x float>* %F
@@ -78,7 +78,7 @@ define double @test4(double %A) nounwind {
; X32: # BB#0: # %entry
; X32-NEXT: subl $12, %esp
; X32-NEXT: calll foo
-; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X32-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X32-NEXT: addsd {{[0-9]+}}(%esp), %xmm0
; X32-NEXT: movsd %xmm0, (%esp)
; X32-NEXT: fldl (%esp)
@@ -90,7 +90,7 @@ define double @test4(double %A) nounwind {
; X64-NEXT: pushq %rax
; X64-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
; X64-NEXT: callq foo
-; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload
; X64-NEXT: popq %rax
; X64-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index 0ad5ef7ee8f..5e32f2c89c7 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -12,13 +12,13 @@
define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i64:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
;
@@ -37,13 +37,13 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
define <4 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_2f64_to_2i32:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE-NEXT: retq
;
@@ -64,13 +64,13 @@ define <4 x i32> @fptosi_2f64_to_2i32(<2 x double> %a) {
define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_2i32:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: movd %rax, %xmm1
@@ -92,19 +92,19 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i64:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: movdqa %xmm3, %xmm1
; SSE-NEXT: retq
@@ -132,20 +132,20 @@ define <4 x i64> @fptosi_4f64_to_4i64(<4 x double> %a) {
define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
; SSE-LABEL: fptosi_4f64_to_4i32:
-; SSE: # BB#0:
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT: cvttsd2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttsd2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm0
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE: # BB#0:
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: cvttsd2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttsd2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm0
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
@@ -173,14 +173,14 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm2, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movd %rdx, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm2, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
@@ -224,14 +224,14 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm1, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm1, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rcx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rcx
@@ -277,14 +277,14 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm1, %xmm0
-; SSE-NEXT: cmovaeq %rax, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm3
-; SSE-NEXT: subsd %xmm1, %xmm3
-; SSE-NEXT: cvttsd2si %xmm3, %rax
-; SSE-NEXT: xorq %rcx, %rax
+; SSE-NEXT: ucomisd %xmm1, %xmm0
+; SSE-NEXT: cmovaeq %rax, %rdx
+; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm3
+; SSE-NEXT: subsd %xmm1, %xmm3
+; SSE-NEXT: cvttsd2si %xmm3, %rax
+; SSE-NEXT: xorq %rcx, %rax
; SSE-NEXT: cvttsd2si %xmm0, %rdx
; SSE-NEXT: ucomisd %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rax, %rdx
@@ -327,14 +327,14 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm2
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm0
-; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
-; SSE-NEXT: movapd %xmm2, %xmm4
-; SSE-NEXT: subsd %xmm3, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomisd %xmm3, %xmm2
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: movaps %xmm2, %xmm4
+; SSE-NEXT: subsd %xmm3, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm2, %rdx
; SSE-NEXT: ucomisd %xmm3, %xmm2
; SSE-NEXT: cmovaeq %rcx, %rdx
@@ -345,14 +345,14 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm2, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm3, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT: movapd %xmm1, %xmm4
-; SSE-NEXT: subsd %xmm3, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomisd %xmm3, %xmm1
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: subsd %xmm3, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rax
; SSE-NEXT: ucomisd %xmm3, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rax
@@ -414,14 +414,14 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm1
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT: movapd %xmm1, %xmm4
-; SSE-NEXT: subsd %xmm2, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomisd %xmm2, %xmm1
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: movaps %xmm1, %xmm4
+; SSE-NEXT: subsd %xmm2, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm1, %rdx
; SSE-NEXT: ucomisd %xmm2, %xmm1
; SSE-NEXT: cmovaeq %rcx, %rdx
@@ -433,14 +433,14 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
; SSE-NEXT: cvttsd2si %xmm3, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rdx
-; SSE-NEXT: ucomisd %xmm2, %xmm0
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm4
-; SSE-NEXT: subsd %xmm2, %xmm4
-; SSE-NEXT: cvttsd2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomisd %xmm2, %xmm0
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: subsd %xmm2, %xmm4
+; SSE-NEXT: cvttsd2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttsd2si %xmm0, %rax
; SSE-NEXT: ucomisd %xmm2, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
@@ -565,13 +565,13 @@ define <4 x i64> @fptosi_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: movd %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -608,13 +608,13 @@ define <4 x i64> @fptosi_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: movd %rax, %xmm1
; SSE-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movaps %xmm0, %xmm1
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %rax, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %rax, %xmm1
-; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movd %rax, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %rax, %xmm1
+; SSE-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE-NEXT: movdqa %xmm2, %xmm0
; SSE-NEXT: retq
;
@@ -655,13 +655,13 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm2, %rax
; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: cvttss2si %xmm0, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: cvttss2si %xmm0, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: retq
@@ -799,13 +799,13 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
-; SSE-NEXT: cvttss2si %xmm2, %rax
-; SSE-NEXT: movd %eax, %xmm0
-; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1,0]
-; SSE-NEXT: cvttss2si %xmm2, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE-NEXT: cvttss2si %xmm2, %rax
+; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT: cvttss2si %xmm2, %rax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
@@ -815,13 +815,13 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1,2,3]
; SSE-NEXT: cvttss2si %xmm3, %rax
; SSE-NEXT: movd %eax, %xmm3
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm2
-; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
-; SSE-NEXT: cvttss2si %xmm1, %rax
-; SSE-NEXT: movd %eax, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movd %eax, %xmm2
+; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT: cvttss2si %xmm1, %rax
+; SSE-NEXT: movd %eax, %xmm1
+; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: movdqa %xmm2, %xmm1
; SSE-NEXT: retq
@@ -888,14 +888,14 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm3, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomiss %xmm1, %xmm3
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: subss %xmm1, %xmm4
+; SSE-NEXT: cvttss2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
@@ -979,14 +979,14 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
; SSE-NEXT: cvttss2si %xmm4, %rcx
; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm3, %rdx
-; SSE-NEXT: ucomiss %xmm1, %xmm3
-; SSE-NEXT: cmovaeq %rcx, %rdx
-; SSE-NEXT: movd %rdx, %xmm3
-; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; SSE-NEXT: movapd %xmm0, %xmm4
-; SSE-NEXT: subss %xmm1, %xmm4
-; SSE-NEXT: cvttss2si %xmm4, %rcx
-; SSE-NEXT: xorq %rax, %rcx
+; SSE-NEXT: ucomiss %xmm1, %xmm3
+; SSE-NEXT: cmovaeq %rcx, %rdx
+; SSE-NEXT: movd %rdx, %xmm3
+; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; SSE-NEXT: movaps %xmm0, %xmm4
+; SSE-NEXT: subss %xmm1, %xmm4
+; SSE-NEXT: cvttss2si %xmm4, %rcx
+; SSE-NEXT: xorq %rax, %rcx
; SSE-NEXT: cvttss2si %xmm0, %rax
; SSE-NEXT: ucomiss %xmm1, %xmm0
; SSE-NEXT: cmovaeq %rcx, %rax
diff --git a/llvm/test/CodeGen/X86/vector-rem.ll b/llvm/test/CodeGen/X86/vector-rem.ll
index 866f18e3134..340dd77ec48 100644
--- a/llvm/test/CodeGen/X86/vector-rem.ll
+++ b/llvm/test/CodeGen/X86/vector-rem.ll
@@ -99,10 +99,10 @@ define <4 x float> @qux(<4 x float> %t, <4 x float> %u) nounwind {
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; CHECK-NEXT: movapd {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
-; CHECK-NEXT: movapd {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; CHECK-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
+; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; CHECK-NEXT: callq fmodf
; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
diff --git a/llvm/test/CodeGen/X86/widen_conv-3.ll b/llvm/test/CodeGen/X86/widen_conv-3.ll
index e8fa1043e9f..f2e29337e6a 100644
--- a/llvm/test/CodeGen/X86/widen_conv-3.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-3.ll
@@ -74,7 +74,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
; X86-SSE2-NEXT: movaps %xmm0, %xmm1
-; X86-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
@@ -123,7 +123,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: psrad $24, %xmm0
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
-; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
; X64-SSE2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/widen_conv-4.ll b/llvm/test/CodeGen/X86/widen_conv-4.ll
index c0d2b775b72..90c4bbe6bb7 100644
--- a/llvm/test/CodeGen/X86/widen_conv-4.ll
+++ b/llvm/test/CodeGen/X86/widen_conv-4.ll
@@ -19,7 +19,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
; X86-SSE2-NEXT: movups %xmm0, (%eax)
; X86-SSE2-NEXT: movss %xmm2, 16(%eax)
; X86-SSE2-NEXT: movaps %xmm2, %xmm0
-; X86-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X86-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X86-SSE2-NEXT: movss %xmm0, 24(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm2, 20(%eax)
@@ -49,7 +49,7 @@ define void @convert_v7i16_v7f32(<7 x float>* %dst.addr, <7 x i16> %src) nounwin
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, 16(%rdi)
; X64-SSE2-NEXT: movups %xmm2, (%rdi)
-; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movss %xmm0, 24(%rdi)
; X64-SSE2-NEXT: retq
;
@@ -100,7 +100,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X86-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X86-SSE2-NEXT: movss %xmm0, (%eax)
; X86-SSE2-NEXT: movaps %xmm0, %xmm1
-; X86-SSE2-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1,0]
+; X86-SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; X86-SSE2-NEXT: movss %xmm1, 8(%eax)
; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; X86-SSE2-NEXT: movss %xmm0, 4(%eax)
@@ -148,7 +148,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
; X64-SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; X64-SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
; X64-SSE2-NEXT: movlps %xmm0, (%rdi)
-; X64-SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0]
+; X64-SSE2-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; X64-SSE2-NEXT: movss %xmm0, 8(%rdi)
; X64-SSE2-NEXT: retq
;