-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td            515
-rw-r--r--  llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll  149
2 files changed, 309 insertions, 355 deletions
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 0d6a9f0f692..d6849767705 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -3123,10 +3123,9 @@ let isCodeGenOnly = 1 in {
}
// Patterns used to select SSE scalar fp arithmetic instructions from
-// a scalar fp operation followed by a blend.
+// either:
//
-// These patterns know, for example, how to select an ADDSS from a
-// float add plus vector insert.
+// (1) a scalar fp operation followed by a blend
//
// The effect is that the backend no longer emits unnecessary vector
// insert instructions immediately after SSE scalar fp instructions
@@ -3138,218 +3137,14 @@ let isCodeGenOnly = 1 in {
// return A;
// }
//
-// previously we generated:
+// Previously we generated:
// addss %xmm0, %xmm1
// movss %xmm1, %xmm0
//
-// we now generate:
+// We now generate:
// addss %xmm1, %xmm0
-
-let Predicates = [UseSSE1] in {
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))))),
- (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-}
-
-let Predicates = [UseSSE2] in {
- // SSE2 patterns to select scalar double-precision fp arithmetic instructions
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-let Predicates = [UseSSE41] in {
- // If the subtarget has SSE4.1 but not AVX, the vector insert instruction is
- // lowered into a X86insertps or a X86Blendi rather than a X86Movss. When
- // selecting SSE scalar single-precision fp arithmetic instructions, make
- // sure that we correctly match them.
-
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (ADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (SUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (MULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (DIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-let Predicates = [HasAVX] in {
- // The following patterns select AVX Scalar single/double precision fp
- // arithmetic instructions.
-
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))))),
- (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (iPTR 0))),
- (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fadd
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fsub
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fmul
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (fdiv
- (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fadd
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fsub
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fmul
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (fdiv
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fadd
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (VADDSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fsub
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (VSUBSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fmul
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (VMULSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
- def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector (fdiv
- (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
- (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-}
-
-// Patterns used to select SSE scalar fp arithmetic instructions from
-// a vector packed single/double fp operation followed by a vector insert.
+//
+// (2) a vector packed single/double fp operation followed by a vector insert
//
// The effect is that the backend converts the packed fp instruction
// followed by a vector insert into a single SSE scalar fp instruction.
@@ -3360,160 +3155,172 @@ let Predicates = [HasAVX] in {
// return (__m128) {c[0], a[1], a[2], a[3]};
// }
//
-// previously we generated:
+// Previously we generated:
// addps %xmm0, %xmm1
// movss %xmm1, %xmm0
//
-// we now generate:
+// We now generate:
// addss %xmm1, %xmm0
-let Predicates = [UseSSE1] in {
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-}
+// TODO: Some canonicalization in lowering would reduce the number of
+// patterns we have to try to match. In particular, the reversed-order
+// blends seem unnecessary.
+multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [UseSSE1] in {
+ // extracted scalar math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+ }
+
+ // With SSE 4.1, insertps/blendi are preferred to movss, so match those too.
+ let Predicates = [UseSSE41] in {
+ // extracted scalar math op with insert via insertps
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
-let Predicates = [UseSSE2] in {
- // SSE2 patterns to select scalar double-precision fp arithmetic instructions
- // from a packed double-precision fp instruction plus movsd.
-
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-}
+ }
-let Predicates = [UseSSE41] in {
- // With SSE4.1 we may see these operations using X86Blendi rather than
- // X86Movs{s,d}.
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (ADDSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (SUBSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (MULSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (DIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
- def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (ADDSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (SUBSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (MULSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (DIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+ // Repeat everything for AVX, except for the movss + extracted-scalar
+ // combination, which presumably does not occur with AVX codegen.
+ let Predicates = [HasAVX] in {
+ // extracted scalar math op with insert via insertps
+ def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (iPTR 0))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+ (Op (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
+ FR32:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
+ (COPY_TO_REGCLASS FR32:$src, VR128))>;
+
+ // vector math op with insert via movss
+ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
+ (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
+ }
}
-let Predicates = [HasAVX] in {
- // The following patterns select AVX Scalar single/double precision fp
- // arithmetic instructions from a packed single precision fp instruction
- // plus movss/movsd.
-
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
- (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
- (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
- (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
- (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
- // Also handle X86Blendi-based patterns.
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fadd (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (VADDSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fsub (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (VSUBSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fmul (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (VMULSSrr_Int v4f32:$dst, v4f32:$src)>;
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (fdiv (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (VDIVSSrr_Int v4f32:$dst, v4f32:$src)>;
-
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
-
- def : Pat<(v2f64 (X86Blendi (fadd (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (VADDSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (fsub (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (VSUBSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (fmul (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (VMULSDrr_Int v2f64:$dst, v2f64:$src)>;
- def : Pat<(v2f64 (X86Blendi (fdiv (v2f64 VR128:$dst), (v2f64 VR128:$src)),
- (v2f64 VR128:$dst), (i8 2))),
- (VDIVSDrr_Int v2f64:$dst, v2f64:$src)>;
+defm : scalar_math_f32_patterns<fadd, "ADD">;
+defm : scalar_math_f32_patterns<fsub, "SUB">;
+defm : scalar_math_f32_patterns<fmul, "MUL">;
+defm : scalar_math_f32_patterns<fdiv, "DIV">;
+
+multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
+ let Predicates = [UseSSE2] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+
+ // With SSE 4.1, blendi is preferred to movsd, so match that too.
+ let Predicates = [UseSSE41] in {
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend (reversed order)
+ def : Pat<(v2f64 (X86Blendi
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
+
+ // Repeat everything for AVX and add one more pattern
+ // (the scalar + blend reversed order) for good measure.
+ let Predicates = [HasAVX] in {
+ // extracted scalar math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
+ (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // extracted scalar math op with insert via blend (reversed order)
+ def : Pat<(v2f64 (X86Blendi (v2f64 (scalar_to_vector
+ (Op (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
+ FR64:$src))), (v2f64 VR128:$dst), (i8 2))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
+ (COPY_TO_REGCLASS FR64:$src, VR128))>;
+
+ // vector math op with insert via movsd
+ def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend
+ def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+
+ // vector math op with insert via blend (reversed order)
+ def : Pat<(v2f64 (X86Blendi
+ (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)),
+ (v2f64 VR128:$dst), (i8 2))),
+ (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
+ }
}
+defm : scalar_math_f64_patterns<fadd, "ADD">;
+defm : scalar_math_f64_patterns<fsub, "SUB">;
+defm : scalar_math_f64_patterns<fmul, "MUL">;
+defm : scalar_math_f64_patterns<fdiv, "DIV">;
+
+
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
/// represent the associated intrinsic operation. This form is unlike the
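For reference, here is the C idiom each pattern family above is meant to catch
(a minimal sketch reconstructed from the comments in X86InstrSSE.td; the
function names are illustrative, and vector-element subscripting and vector
compound literals are Clang/GCC extensions):

    #include <xmmintrin.h>

    /* Pattern (1): a scalar fp op followed by a blend/insert.
       With the patterns above, this selects a single addss instead
       of addss + movss. */
    __m128 scalar_then_insert(__m128 a, float f) {
      a[0] += f; /* extract lane 0, fadd, reinsert into lane 0 */
      return a;
    }

    /* Pattern (2): a packed fp op followed by a vector insert.
       Only lane 0 of the packed add is kept, so a single addss suffices. */
    __m128 packed_then_insert(__m128 a, __m128 b) {
      __m128 c = _mm_add_ps(a, b);
      return (__m128){c[0], a[1], a[2], a[3]};
    }
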
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index b122ef67544..8b1c6d0c882 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -370,8 +370,155 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
ret <4 x float> %3
}
+; With SSE4.1 or greater, the shuffles in the following tests may
+; be lowered to X86Blendi nodes.
+
+define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_add_ss:
+; SSE: # BB#0:
+; SSE-NEXT: addss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_add_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <4 x float> %a, i32 0
+ %op = fadd float %b, %ext
+ %ins = insertelement <4 x float> undef, float %op, i32 0
+ %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_sub_ss:
+; SSE: # BB#0:
+; SSE-NEXT: subss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_sub_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vsubss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <4 x float> %a, i32 0
+ %op = fsub float %ext, %b
+ %ins = insertelement <4 x float> undef, float %op, i32 0
+ %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_mul_ss:
+; SSE: # BB#0:
+; SSE-NEXT: mulss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_mul_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vmulss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <4 x float> %a, i32 0
+ %op = fmul float %b, %ext
+ %ins = insertelement <4 x float> undef, float %op, i32 0
+ %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuf
+}
+
+define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
+; SSE-LABEL: blend_div_ss:
+; SSE: # BB#0:
+; SSE-NEXT: divss %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_div_ss:
+; AVX: # BB#0:
+; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <4 x float> %a, i32 0
+ %op = fdiv float %ext, %b
+ %ins = insertelement <4 x float> undef, float %op, i32 0
+ %shuf = shufflevector <4 x float> %ins, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %shuf
+}
+
+define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_add_sd:
+; SSE: # BB#0:
+; SSE-NEXT: addsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_add_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <2 x double> %a, i32 0
+ %op = fadd double %b, %ext
+ %ins = insertelement <2 x double> undef, double %op, i32 0
+ %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_sub_sd:
+; SSE: # BB#0:
+; SSE-NEXT: subsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_sub_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <2 x double> %a, i32 0
+ %op = fsub double %ext, %b
+ %ins = insertelement <2 x double> undef, double %op, i32 0
+ %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_mul_sd:
+; SSE: # BB#0:
+; SSE-NEXT: mulsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_mul_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <2 x double> %a, i32 0
+ %op = fmul double %b, %ext
+ %ins = insertelement <2 x double> undef, double %op, i32 0
+ %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuf
+}
+
+define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
+; SSE-LABEL: blend_div_sd:
+; SSE: # BB#0:
+; SSE-NEXT: divsd %xmm1, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: blend_div_sd:
+; AVX: # BB#0:
+; AVX-NEXT: vdivsd %xmm1, %xmm0, %xmm0
+; AVX-NEXT: retq
+
+ %ext = extractelement <2 x double> %a, i32 0
+ %op = fdiv double %ext, %b
+ %ins = insertelement <2 x double> undef, double %op, i32 0
+ %shuf = shufflevector <2 x double> %ins, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %shuf
+}
+
; Ensure that the backend selects SSE/AVX scalar fp instructions
-; from a packed fp instrution plus a vector insert.
+; from a packed fp instruction plus a vector insert.
define <4 x float> @insert_test_add_ss(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: insert_test_add_ss:
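For completeness, a rough C equivalent of the blend_add_sd test added above
(a sketch assuming Clang/GCC vector extensions; the checked IR in the test is
authoritative):

    #include <emmintrin.h>

    /* Extract lane 0 of a, add b, and reinsert while keeping lane 1 of a.
       With the patterns in X86InstrSSE.td this selects a single addsd. */
    __m128d blend_add_sd(__m128d a, double b) {
      a[0] += b;
      return a;
    }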