Diffstat (limited to 'llvm')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp          5
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td             194
-rw-r--r--  llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll     3
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll  88
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll   8
-rw-r--r--  llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll  26
6 files changed, 284 insertions, 40 deletions
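
The effect is visible in the shuffle tests below: a lane-preserving v2f64 shuffle that SSE2 lowers with movsd is emitted as blendpd once SSE4.1 is available. A minimal IR sketch of the shape that changes, taken from the shuffle_v2f64_03 test updated in this commit:

; Selects lane 0 from %a and lane 1 from %b. With this patch, SSE4.1+
; targets lower this to blendpd (vblendpd on AVX) instead of movsd;
; plain SSE2 still uses movsd.
define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
  %shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %shuffle
}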
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b6d134ff0fb..fa48b23ec0c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7830,6 +7830,11 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
+ // This is essentially a special case blend operation, but if we have
+ // general purpose blend operations, they are always faster. Bail and let
+ // the rest of the lowering handle these as blends.
+ if (Subtarget->hasSSE41())
+ return SDValue();
// Otherwise, use MOVSD or MOVSS.
assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index a2d97456405..03ce09f5f07 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -3125,7 +3125,6 @@ let Predicates = [UseSSE1] in {
let Predicates = [UseSSE2] in {
// SSE2 patterns to select scalar double-precision fp arithmetic instructions
-
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
(f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
@@ -3145,10 +3144,10 @@ let Predicates = [UseSSE2] in {
}
let Predicates = [UseSSE41] in {
- // If the subtarget has SSE4.1 but not AVX, the vector insert
- // instruction is lowered into a X86insertps rather than a X86Movss.
- // When selecting SSE scalar single-precision fp arithmetic instructions,
- // make sure that we correctly match the X86insertps.
+ // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is
+ // lowered into an X86insertps or an X86Blendi rather than an X86Movss. When
+ // selecting SSE scalar single-precision fp arithmetic instructions, make
+ // sure that we correctly match them.
def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
@@ -3166,6 +3165,57 @@ let Predicates = [UseSSE41] in {
(fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
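
The patterns above fire on the extract/op/insert idiom once the lane-0 reinsertion is lowered as X86Blendi rather than X86Movss/X86Movsd. A hedged IR sketch of the fadd case (function name illustrative; it mirrors the forms exercised by sse-scalar-fp-arith.ll):

; Scalar add on lane 0, with the result reinserted into lane 0. On SSE4.1
; the insert becomes a blend with immediate 1, and the pattern above still
; folds the whole sequence into a single addss.
define <4 x float> @test_add_ss(<4 x float> %a, float %b) {
  %ext = extractelement <4 x float> %a, i32 0
  %add = fadd float %ext, %b
  %res = insertelement <4 x float> %a, float %add, i32 0
  ret <4 x float> %res
}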
let Predicates = [HasAVX] in {
@@ -3204,6 +3254,57 @@ let Predicates = [HasAVX] in {
(fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))), (iPTR 0))),
(VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
+ (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
+ (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
// Patterns used to select SSE scalar fp arithmetic instructions from
@@ -3258,6 +3359,49 @@ let Predicates = [UseSSE2] in {
(DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}
+let Predicates = [UseSSE41] in {
+ // With SSE4.1 we may see these operations using X86Blendi rather than
+ // X86Movs{s,d}.
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+ def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+}
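
This second group handles the variant where the IR performs the arithmetic on full vectors and then blends lane 0 of the result back into the destination. A sketch of that shape (illustrative, modeled on the insert-style tests in sse-scalar-fp-arith.ll):

; Packed fadd whose lanes 1-3 are discarded by the blend: lane 0 comes
; from %sum, lanes 1-3 from %a. The patterns above reduce this to a
; single addss as well.
define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
  %sum = fadd <4 x float> %a, %b
  %res = shufflevector <4 x float> %sum, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
  ret <4 x float> %res
}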
+
let Predicates = [HasAVX] in {
// The following patterns select AVX Scalar single/double precision fp
// arithmetic instructions from a packed single precision fp instruction
@@ -3287,6 +3431,46 @@ let Predicates = [HasAVX] in {
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
(fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
(VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+ // Also handle X86Blendi-based patterns.
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
+
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+
+ def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
+ def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
}
/// Unop Arithmetic
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index b122ef67544..415a4f12b2c 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1,6 +1,9 @@
; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse2 < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=SSE --check-prefix=SSE2 %s
; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
+; RUN: llc -mcpu=x86-64 -mattr=+sse4.1 < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=SSE --check-prefix=SSE41 %s
; RUN: llc -mcpu=x86-64 -mattr=+avx < %s | FileCheck --check-prefix=AVX %s
+; RUN: llc -mcpu=x86-64 -mattr=+avx < %s -x86-experimental-vector-shuffle-lowering | FileCheck --check-prefix=AVX %s
target triple = "x86_64-unknown-unknown"
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
index aa837f15e57..59041367bba 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -211,28 +211,61 @@ define <2 x double> @shuffle_v2f64_33(<2 x double> %a, <2 x double> %b) {
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_03(<2 x double> %a, <2 x double> %b) {
-; SSE-LABEL: shuffle_v2f64_03:
-; SSE: # BB#0:
-; SSE-NEXT: movsd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v2f64_03:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_03:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm0, %xmm1
+; SSE3-NEXT: movaps %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_03:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm0, %xmm1
+; SSSE3-NEXT: movaps %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_03:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_03:
; AVX: # BB#0:
-; AVX-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 0, i32 3>
ret <2 x double> %shuffle
}
define <2 x double> @shuffle_v2f64_21(<2 x double> %a, <2 x double> %b) {
-; SSE-LABEL: shuffle_v2f64_21:
-; SSE: # BB#0:
-; SSE-NEXT: movsd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v2f64_21:
+; SSE2: # BB#0:
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_21:
+; SSE3: # BB#0:
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_21:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_21:
+; SSE41: # BB#0:
+; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_21:
; AVX: # BB#0:
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> %b, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
@@ -753,16 +786,35 @@ define <2 x double> @shuffle_v2f64_z0(<2 x double> %a) {
}
define <2 x double> @shuffle_v2f64_z1(<2 x double> %a) {
-; SSE-LABEL: shuffle_v2f64_z1:
-; SSE: # BB#0:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: movsd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: shuffle_v2f64_z1:
+; SSE2: # BB#0:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: movsd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE3-LABEL: shuffle_v2f64_z1:
+; SSE3: # BB#0:
+; SSE3-NEXT: xorps %xmm1, %xmm1
+; SSE3-NEXT: movsd %xmm1, %xmm0
+; SSE3-NEXT: retq
+;
+; SSSE3-LABEL: shuffle_v2f64_z1:
+; SSSE3: # BB#0:
+; SSSE3-NEXT: xorps %xmm1, %xmm1
+; SSSE3-NEXT: movsd %xmm1, %xmm0
+; SSSE3-NEXT: retq
+;
+; SSE41-LABEL: shuffle_v2f64_z1:
+; SSE41: # BB#0:
+; SSE41-NEXT: xorpd %xmm1, %xmm1
+; SSE41-NEXT: blendpd {{.*#+}} xmm1 = xmm1[0],xmm0[1]
+; SSE41-NEXT: movapd %xmm1, %xmm0
+; SSE41-NEXT: retq
;
; AVX-LABEL: shuffle_v2f64_z1:
; AVX: # BB#0:
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; AVX-NEXT: retq
%shuffle = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
ret <2 x double> %shuffle
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 32ee62fa985..7899a52a741 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -55,7 +55,7 @@ define <4 x double> @shuffle_v4f64_0300(<4 x double> %a, <4 x double> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
-; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4f64_0300:
@@ -382,7 +382,7 @@ define <4 x i64> @shuffle_v4i64_0300(<4 x i64> %a, <4 x i64> %b) {
; AVX1: # BB#0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[0,1,2,2]
-; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_0300:
@@ -518,7 +518,7 @@ define <4 x i64> @shuffle_v4i64_4012(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1],xmm2[0]
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: shuffle_v4i64_4012:
@@ -654,7 +654,7 @@ define <4 x i64> @stress_test1(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; AVX1-NEXT: vmovsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
; AVX1-NEXT: retq
;
; AVX2-LABEL: stress_test1:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
index 662b9832611..2f02f2fc08f 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -91,7 +91,7 @@ define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) {
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
; ALL-NEXT: vbroadcastsd %xmm0, %ymm0
-; ALL-NEXT: vmovsd %xmm1, %xmm0, %xmm1
+; ALL-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3]
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
@@ -275,12 +275,12 @@ define <8 x double> @shuffle_v8f64_08192a3b(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_08991abb(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_08991abb:
; ALL: # BB#0:
-; ALL-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,1,1]
-; ALL-NEXT: vmovsd %xmm0, %xmm2, %xmm2
-; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,3,3]
-; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT: vpermilpd {{.*#+}} ymm2 = ymm0[1,0,2,2]
+; ALL-NEXT: vpermpd {{.*#+}} ymm3 = ymm1[0,2,3,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0],ymm3[1,2,3]
+; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,0,1,1]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 8, i32 9, i32 9, i32 1, i32 10, i32 11, i32 11>
ret <8 x double> %shuffle
@@ -303,11 +303,11 @@ define <8 x double> @shuffle_v8f64_091b2d3f(<8 x double> %a, <8 x double> %b) {
define <8 x double> @shuffle_v8f64_09ab1def(<8 x double> %a, <8 x double> %b) {
; ALL-LABEL: shuffle_v8f64_09ab1def:
; ALL: # BB#0:
-; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm2
-; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
-; ALL-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2]
-; ALL-NEXT: vmovsd %xmm0, %xmm1, %xmm0
-; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm2, %zmm0
+; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm2
+; ALL-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,0,2,2]
+; ALL-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3]
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; ALL-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 1, i32 13, i32 14, i32 15>
ret <8 x double> %shuffle
@@ -721,7 +721,7 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3]
; ALL-NEXT: vextractf64x4 $1, %zmm1, %ymm1
; ALL-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[3,1,2,3]
-; ALL-NEXT: vmovsd %xmm1, %xmm0, %xmm0
+; ALL-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
; ALL-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
; ALL-NEXT: retq
%shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 15, i32 5, i32 1, i32 1, i32 2, i32 3, i32 5, i32 10>