| author | Craig Topper <craig.topper@intel.com> | 2019-06-21 17:24:21 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2019-06-21 17:24:21 +0000 |
| commit | 6af1be96641f34e10bf3b4866f72571b63fab27c (patch) | |
| tree | 5316df5a0d97530c01855f9f7d587b6109a68927 /llvm/lib | |
| parent | 4c9def4a51ac10d9a249f31ea712c32474e89914 (diff) | |
[X86] Use vmovq for v4i64/v4f64/v8i64/v8f64 vzmovl.
We already use vmovq for v2i64/v2f64 vzmovl, but for v4i64/v4f64/v8i64/v8f64 we
were using blendpd+xorpd when optimizing for speed, or movsd+xorpd when
optimizing for size.

I think the blend with zero or movss/movsd is only needed for vXi32, where we
don't have an instruction that can move 32 bits from one xmm to another while
zeroing the upper bits.

movq is no worse than blendpd on any known CPU. A rough sketch of the operation
follows the message.
llvm-svn: 364079
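As a rough illustration (not part of the patch, and only an assumption about one way to spell such a shuffle in C), the X86vzmovl node these patterns match keeps element 0 of a vector and zeroes every other element. For 64-bit elements a single register-to-register vmovq already does that within an xmm, which is what the new patterns rely on; for 32-bit elements no single instruction does, so the blend/movss forms remain.

```c
#include <immintrin.h>

/* Hand-written sketch of the "vzmovl" operation: keep lane 0, zero the rest.
 * The intrinsics are illustrative only; the patch merely changes which
 * instructions are selected for the 64-bit-element cases. */

/* v4f64 case: with this change the whole operation can lower to a single
 * `vmovq xmm, xmm`, since vmovq clears bits 64..127 of its destination and a
 * VEX/EVEX write to an xmm register clears the rest of the ymm/zmm. */
static inline __m256d keep_low_f64(__m256d v) {
  return _mm256_blend_pd(_mm256_setzero_pd(), v, 0x1); /* lane 0 from v */
}

/* v8f32 case: no single instruction moves 32 bits while zeroing the upper
 * bits, so a blend (OptForSpeed) or movss (OptForSize) against zero stays. */
static inline __m256 keep_low_f32(__m256 v) {
  return _mm256_blend_ps(_mm256_setzero_ps(), v, 0x1); /* lane 0 from v */
}
```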
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 53 |
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 35 |
2 files changed, 35 insertions, 53 deletions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 9f4a75c6689..8315b867316 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4286,15 +4286,6 @@ let Predicates = [HasAVX512, OptForSize] in {
              (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))), sub_xmm)>;
-
   def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
             (SUBREG_TO_REG (i32 0),
              (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4303,17 +4294,6 @@ let Predicates = [HasAVX512, OptForSize] in {
             (SUBREG_TO_REG (i32 0),
              (v4i32 (VMOVSSZrr (v4i32 (AVX512_128_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))), sub_xmm)>;
-
 }
 
 // Use 128-bit blends for OptForSpeed since BLENDs have better throughput than
@@ -4329,17 +4309,6 @@ let Predicates = [HasAVX512, OptForSpeed] in {
              (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm)),
                     (i8 3))), sub_xmm)>;
-
-  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)),
-                    (i8 1))), sub_xmm)>;
-  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)),
-                    (i8 0xf))), sub_xmm)>;
 }
 
 let Predicates = [HasAVX512] in {
@@ -4452,6 +4421,28 @@ let Predicates = [HasAVX512] in {
             (SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
+
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)))),
+             sub_xmm)>;
+
+  def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIZrr
+                     (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIZrr
+                     (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)))),
+             sub_xmm)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index c96bac6828f..e25d2dca404 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -312,17 +312,6 @@ let Predicates = [UseAVX, OptForSize] in {
             (SUBREG_TO_REG (i32 0),
              (v4i32 (VMOVSSrr (v4i32 (V_SET0)),
                     (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)))), sub_xmm)>;
-
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VMOVSDrr (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
-             sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VMOVSDrr (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
-             sub_xmm)>;
 }
 
 let Predicates = [UseSSE1] in {
@@ -4307,6 +4296,19 @@ let Predicates = [UseSSE2] in {
             (MOVZPQILo2PQIrr VR128:$src)>;
 }
 
+let Predicates = [UseAVX] in {
+  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2f64 (VMOVZPQILo2PQIrr
+                     (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)))),
+             sub_xmm)>;
+  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
+            (SUBREG_TO_REG (i32 0),
+             (v2i64 (VMOVZPQILo2PQIrr
+                     (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)))),
+             sub_xmm)>;
+}
+
 //===---------------------------------------------------------------------===//
 // SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
 //===---------------------------------------------------------------------===//
@@ -6319,17 +6321,6 @@ let Predicates = [HasAVX, OptForSpeed] in {
             (v4i32 (VPBLENDWrri (v4i32 (V_SET0)),
                    (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm)),
                    (i8 3))), sub_xmm)>;
-
-  def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2f64 (VBLENDPDrri (v2f64 (V_SET0)),
-                    (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm)),
-                    (i8 1))), sub_xmm)>;
-  def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
-            (SUBREG_TO_REG (i32 0),
-             (v2i64 (VPBLENDWrri (v2i64 (V_SET0)),
-                    (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm)),
-                    (i8 0xf))), sub_xmm)>;
 }
 
 // Prefer a movss or movsd over a blendps when optimizing for size. these were
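A brief note on why the replacement patterns suffice: the VMOVZPQILo2PQI*rr instructions they select are the register forms of vmovq, which copy the low quadword and zero bits 64..127 of the destination, and the surrounding SUBREG_TO_REG relies on a VEX/EVEX xmm write zeroing the upper portion of the wider register, so no explicit zero input (AVX512_128_SET0/V_SET0) is needed any more. Below is a tiny runnable check of the illustrative sketch shown after the commit message; it verifies the shuffle semantics only, not the generated instructions.

```c
#include <immintrin.h>
#include <stdio.h>

/* Same illustrative helper as in the earlier sketch: keep lane 0 of a v4f64
 * and zero lanes 1-3 (an assumed example, not code from the patch). */
static inline __m256d keep_low_f64(__m256d v) {
  return _mm256_blend_pd(_mm256_setzero_pd(), v, 0x1);
}

int main(void) {
  /* _mm256_set_pd takes arguments high-to-low, so lane 0 holds 1.0. */
  __m256d v = _mm256_set_pd(4.0, 3.0, 2.0, 1.0);
  double out[4];
  _mm256_storeu_pd(out, keep_low_f64(v));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); /* expect: 1 0 0 0 */
  return 0;
}
```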

