Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp        | 44
-rw-r--r--  llvm/test/CodeGen/X86/avx-splat.ll             |  2
-rw-r--r--  llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll |  8
-rw-r--r--  llvm/test/CodeGen/X86/extractelement-load.ll   |  4
-rw-r--r--  llvm/test/CodeGen/X86/vec_extract.ll           |  2
-rw-r--r--  llvm/test/CodeGen/X86/vec_shuffle-38.ll        |  2
-rw-r--r--  llvm/test/CodeGen/X86/vec_splat.ll             |  2
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll | 43
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll | 12
9 files changed, 88 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f04a7810a13..f3774321a07 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19301,26 +19301,52 @@ static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef<int> Mask,
// Use the float domain if the operand type is a floating point type.
bool FloatDomain = VT.isFloatingPoint();
- // If we don't have access to VEX encodings, the generic PSHUF instructions
- // are preferable to some of the specialized forms despite requiring one more
- // byte to encode because they can implicitly copy.
+ // For floating point shuffles, we don't have free copies in the shuffle
+ // instructions, so it always makes sense to canonicalize.
//
- // IF we *do* have VEX encodings, than we can use shorter, more specific
+ // For integer shuffles, if we don't have access to VEX encodings, the generic
+ // PSHUF instructions are preferable to some of the specialized forms despite
+ // requiring one more byte to encode because they can implicitly copy.
+ //
+ // If we *do* have VEX encodings, then we can use shorter, more specific
// shuffle instructions freely as they can copy due to the extra register
// operand.
- if (Subtarget->hasAVX()) {
+ if (FloatDomain || Subtarget->hasAVX()) {
// We have both floating point and integer variants of shuffles that dup
// either the low or high half of the vector.
if (Mask.equals(0, 0) || Mask.equals(1, 1)) {
bool Lo = Mask.equals(0, 0);
- unsigned Shuffle = FloatDomain ? (Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS)
- : (Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH);
+ unsigned Shuffle;
+ MVT ShuffleVT;
+ // If the input is floating point, check whether we have SSE3, which lets
+ // us use MOVDDUP. That instruction is no slower than UNPCKLPD, and it can
+ // fold even an unaligned memory load into its input operand.
+ if (FloatDomain && Lo && Subtarget->hasSSE3()) {
+ Shuffle = X86ISD::MOVDDUP;
+ ShuffleVT = MVT::v2f64;
+ } else if (FloatDomain) {
+ // MOVLHPS and MOVHLPS are available throughout SSE and have smaller
+ // encodings than the UNPCK variants.
+ Shuffle = Lo ? X86ISD::MOVLHPS : X86ISD::MOVHLPS;
+ ShuffleVT = MVT::v4f32;
+ } else if (Subtarget->hasSSE2()) {
+ // We model everything else using UNPCK instructions. While MOVLHPS and
+ // MOVHLPS have shorter encodings, they cannot accept a memory operand,
+ // which overly constrains subsequent lowering.
+ Shuffle = Lo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+ ShuffleVT = MVT::v2i64;
+ } else {
+ // No available instructions here.
+ return false;
+ }
if (Depth == 1 && Root->getOpcode() == Shuffle)
return false; // Nothing to do!
- MVT ShuffleVT = FloatDomain ? MVT::v4f32 : MVT::v2i64;
Op = DAG.getNode(ISD::BITCAST, DL, ShuffleVT, Input);
DCI.AddToWorklist(Op.getNode());
- Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
+ if (Shuffle == X86ISD::MOVDDUP)
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op);
+ else
+ Op = DAG.getNode(Shuffle, DL, ShuffleVT, Op, Op);
DCI.AddToWorklist(Op.getNode());
DCI.CombineTo(Root.getNode(), DAG.getNode(ISD::BITCAST, DL, RootVT, Op),
/*AddTo*/ true);
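
The decision tree this hunk introduces is compact enough to state on its own. What follows is a minimal sketch of the opcode/type selection for a lane-duplicating v2 shuffle under the same feature checks; the function name pickSplatShuffle and both enums are illustrative only, not LLVM APIs.

#include <optional>
#include <utility>

enum class SplatOp { MOVDDUP, MOVLHPS, MOVHLPS, UNPCKL, UNPCKH };
enum class SplatVT { v2f64, v4f32, v2i64 };

// Mirrors the combine above: returns the shuffle opcode and the vector type
// to bitcast to, or nullopt when no suitable instruction is available.
std::optional<std::pair<SplatOp, SplatVT>>
pickSplatShuffle(bool FloatDomain, bool Lo, bool HasSSE3, bool HasSSE2) {
  // SSE3's MOVDDUP duplicates the low double and can fold its input
  // operand even from an unaligned memory load.
  if (FloatDomain && Lo && HasSSE3)
    return std::make_pair(SplatOp::MOVDDUP, SplatVT::v2f64);
  // MOVLHPS/MOVHLPS exist throughout SSE and encode smaller than UNPCK.
  if (FloatDomain)
    return std::make_pair(Lo ? SplatOp::MOVLHPS : SplatOp::MOVHLPS,
                          SplatVT::v4f32);
  // Integer domain: UNPCK accepts a memory operand, which MOVLHPS/MOVHLPS
  // cannot, so it constrains subsequent lowering less.
  if (HasSSE2)
    return std::make_pair(Lo ? SplatOp::UNPCKL : SplatOp::UNPCKH,
                          SplatVT::v2i64);
  return std::nullopt;
}

Note that MOVDDUP is the only unary choice here; the combine reflects that by building a single-operand DAG node for it and a two-operand node (Op, Op) for every other shuffle.
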
diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll
index 3856aeac3fd..a2537ce5c04 100644
--- a/llvm/test/CodeGen/X86/avx-splat.ll
+++ b/llvm/test/CodeGen/X86/avx-splat.ll
@@ -30,7 +30,7 @@ entry:
ret <4 x i64> %vecinit6.i
}
-; CHECK: vmovlhps %xmm
+; CHECK: vunpcklpd %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
diff --git a/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll b/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll
index 0ddec2c12fb..84820e4e3c3 100644
--- a/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll
+++ b/llvm/test/CodeGen/X86/combine-vec-shuffle-4.ll
@@ -77,7 +77,7 @@ define <4 x float> @test7(<4 x float> %a) {
; Mask: [0,1,0,1]
; CHECK-NOT: pshufd
; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
; CHECK-NEXT: ret
define <4 x float> @test8(<4 x float> %a) {
@@ -89,7 +89,7 @@ define <4 x float> @test8(<4 x float> %a) {
; Mask: [0,1,0,u]
; CHECK-NOT: pshufd
; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
; CHECK-NEXT: ret
define <4 x float> @test9(<4 x float> %a) {
@@ -196,7 +196,7 @@ define <4 x float> @test17(<4 x float> %a) {
; Mask: [0,1,0,1]
; CHECK-NOT: pshufd
; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
; CHECK-NEXT: ret
define <4 x float> @test18(<4 x float> %a) {
@@ -208,7 +208,7 @@ define <4 x float> @test18(<4 x float> %a) {
; Mask: [0,1,0,u]
; CHECK-NOT: pshufd
; CHECK-NOT: shufps
-; CHECK: movlhps
+; CHECK: unpcklpd
; CHECK-NEXT: ret
define <4 x float> @test19(<4 x float> %a) {
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 3e31b4b190b..0d5d299ed10 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -36,7 +36,9 @@ define void @t3() {
;
; This movs the entire vector, shuffling the high double down. If we fixed the
; FIXME above it would just move the high double directly.
-; CHECK: movhpd %xmm
+; CHECK: movups
+; CHECK: movhlps
+; CHECK: movlps
bb:
%tmp13 = load <2 x double>* undef, align 1
diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll
index 93380427f49..6391ef61682 100644
--- a/llvm/test/CodeGen/X86/vec_extract.ll
+++ b/llvm/test/CodeGen/X86/vec_extract.ll
@@ -41,7 +41,7 @@ entry:
define double @test4(double %A) nounwind {
; CHECK-LABEL: test4:
; CHECK: calll {{.*}}foo
-; CHECK-NEXT: unpckhpd %[[X:xmm[0-9]+]], %[[X]]
+; CHECK-NEXT: movhlps %[[X:xmm[0-9]+]], %[[X]]
; CHECK-NEXT: addsd {{.*}}(%{{.*}}), %[[X2]]
; CHECK-NEXT: movsd %[[X2]], [[mem:.*\(%.*\)]]
; CHECK-NEXT: fldl [[mem]]
diff --git a/llvm/test/CodeGen/X86/vec_shuffle-38.ll b/llvm/test/CodeGen/X86/vec_shuffle-38.ll
index ec196df7aef..7e4f747f735 100644
--- a/llvm/test/CodeGen/X86/vec_shuffle-38.ll
+++ b/llvm/test/CodeGen/X86/vec_shuffle-38.ll
@@ -7,7 +7,7 @@ define <2 x double> @ld(<2 x double> %p) nounwind optsize ssp {
}
define <2 x double> @hd(<2 x double> %p) nounwind optsize ssp {
-; CHECK: unpckhpd
+; CHECK: movhlps
%shuffle = shufflevector <2 x double> %p, <2 x double> undef, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
}
diff --git a/llvm/test/CodeGen/X86/vec_splat.ll b/llvm/test/CodeGen/X86/vec_splat.ll
index 28f2a9074cb..07eeb3575c7 100644
--- a/llvm/test/CodeGen/X86/vec_splat.ll
+++ b/llvm/test/CodeGen/X86/vec_splat.ll
@@ -28,7 +28,7 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
ret void
; SSE2-LABEL: test_v2sd:
-; SSE2: shufpd $0
+; SSE2: movlhps
; SSE3-LABEL: test_v2sd:
; SSE3: movddup
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index d0e8dfd242a..49d2eeb482f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck %s --check-prefix=CHECK-SSE3
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
@@ -48,7 +49,7 @@ define <2 x i64> @shuffle_v2i64_33(<2 x i64> %a, <2 x i64> %b) {
define <2 x double> @shuffle_v2f64_00(<2 x double> %a, <2 x double> %b) {
; CHECK-SSE2-LABEL: @shuffle_v2f64_00
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 0>
ret <2 x double> %shuffle
@@ -62,17 +63,15 @@ define <2 x double> @shuffle_v2f64_10(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_11(<2 x double> %a, <2 x double> %b) {
; CHECK-SSE2-LABEL: @shuffle_v2f64_11
-; CHECK-SSE2: shufpd {{.*}} # xmm0 = xmm0[1,1]
+; CHECK-SSE2: movhlps {{.*}} # xmm0 = xmm0[1,1]
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 1, i32 1>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_22(<2 x double> %a, <2 x double> %b) {
-; FIXME: Should these use movapd + shufpd to remove a domain change at the cost
-; of a mov?
-;
; CHECK-SSE2-LABEL: @shuffle_v2f64_22
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[0,1,0,1]
+; CHECK-SSE2: movlhps {{.*}} # xmm1 = xmm1[0,0]
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 2>
ret <2 x double> %shuffle
@@ -86,7 +85,8 @@ define <2 x double> @shuffle_v2f64_32(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
; CHECK-SSE2-LABEL: @shuffle_v2f64_33
-; CHECK-SSE2: pshufd {{.*}} # xmm0 = xmm1[2,3,2,3]
+; CHECK-SSE2: movhlps {{.*}} # xmm1 = xmm1[1,1]
+; CHECK-SSE2-NEXT: movaps %xmm1, %xmm0
; CHECK-SSE2-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 3, i32 3>
ret <2 x double> %shuffle
@@ -217,3 +217,32 @@ define <2 x i64> @shuffle_v2i64_31_copy(<2 x i64> %nonce, <2 x i64> %a, <2 x i64
%shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 3, i32 1>
ret <2 x i64> %shuffle
}
+
+
+define <2 x double> @insert_dup_reg_v2f64(double %a) {
+; CHECK-SSE2-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE2: movlhps {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT: retq
+;
+; FIXME: This should match movddup as well!
+; CHECK-SSE3-LABEL: @insert_dup_reg_v2f64
+; CHECK-SSE3: unpcklpd {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE3-NEXT: retq
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %shuffle
+}
+define <2 x double> @insert_dup_mem_v2f64(double* %ptr) {
+; CHECK-SSE2-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE2: movsd {{.*}}, %xmm0
+; CHECK-SSE2-NEXT: movlhps {{.*}} # xmm0 = xmm0[0,0]
+; CHECK-SSE2-NEXT: retq
+;
+; CHECK-SSE3-LABEL: @insert_dup_mem_v2f64
+; CHECK-SSE3: movddup {{.*}}, %xmm0
+; CHECK-SSE3-NEXT: retq
+ %a = load double* %ptr
+ %v = insertelement <2 x double> undef, double %a, i32 0
+ %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 0, i32 0>
+ ret <2 x double> %shuffle
+}
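
For reference, the new CHECK-SSE3 lines can be exercised by expanding the RUN line added at the top of this file with %s substituted; roughly (the path is illustrative):

llc < llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse3 -x86-experimental-vector-shuffle-lowering | FileCheck llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll --check-prefix=CHECK-SSE3
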
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index cde96dbb30f..a21b78985d7 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -98,7 +98,7 @@ define <4 x i64> @shuffle_v4i64_3210(<4 x i64> %a, <4 x i64> %b) {
define <4 x double> @shuffle_v4f64_0001(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_0001
; AVX1: # BB#0:
-; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
@@ -109,7 +109,7 @@ define <4 x double> @shuffle_v4f64_0020(<4 x double> %a, <4 x double> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0],xmm0[0]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 0, i32 2, i32 0>
@@ -120,7 +120,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[0],xmm1[1]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 3, i32 0, i32 0>
@@ -130,7 +130,7 @@ define <4 x double> @shuffle_v4f64_1000(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_1000
; AVX1: # BB#0:
; AVX1-NEXT: vshufpd {{.*}} # xmm1 = xmm0[1,0]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 1, i32 0, i32 0, i32 0>
@@ -140,8 +140,8 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
; AVX1-LABEL: @shuffle_v4f64_2200
; AVX1: # BB#0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vmovlhps {{.*}} # xmm1 = xmm1[0,0]
-; AVX1-NEXT: vmovlhps {{.*}} # xmm0 = xmm0[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm1 = xmm1[0,0]
+; AVX1-NEXT: vunpcklpd {{.*}} # xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
%shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 0, i32 0>