diff options
| author | Craig Topper <craig.topper@intel.com> | 2018-07-14 02:05:08 +0000 |
|---|---|---|
| committer | Craig Topper <craig.topper@intel.com> | 2018-07-14 02:05:08 +0000 |
| commit | f0b164415c383be9d20031d52cf5db9ae71cded8 (patch) | |
| tree | 26beee3fa7ce875eab5f2aa1d6b23cab85871f70 /llvm/lib | |
| parent | 70993d37e8dd2031b623071372d7aebefb18c74c (diff) | |
| download | bcm5719-llvm-f0b164415c383be9d20031d52cf5db9ae71cded8.tar.gz bcm5719-llvm-f0b164415c383be9d20031d52cf5db9ae71cded8.zip | |
[X86] Prefer blendi over movss/sd when avx512 is enabled unless optimizing for size.
AVX512 doesn't have an immediate-controlled blend instruction. But blend throughput is still better than movss/sd on SKX.
This commit changes AVX512 to use the AVX blend instructions instead of MOVSS/MOVSD. This constrains the register allocation since it won't be able to use XMM16-31, but hopefully the increased throughput and reduced port 5 pressure makes up for that.
llvm-svn: 337083
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 23 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 4 |
2 files changed, 16 insertions, 11 deletions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 3df4da13cd5..bfc26258ce4 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -3936,6 +3936,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}", multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> { + let Predicates = [HasAVX512, OptForSize] in def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -4324,7 +4325,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# (VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask, VR128X:$src1, VR128X:$src2), 0>; -let Predicates = [HasAVX512] in { +let Predicates = [HasAVX512, OptForSize] in { def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), @@ -4339,6 +4340,17 @@ let Predicates = [HasAVX512] in { (SUBREG_TO_REG (i32 0), (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>; + + def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), + (VMOVSDZrr (v2f64 (AVX512_128_SET0)), + (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; + def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), + (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), + (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; +} + +let Predicates = [HasAVX512] in { def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))), (SUBREG_TO_REG (i32 0), (VMOVSSZrr (v4f32 (AVX512_128_SET0)), @@ -4405,18 +4417,11 @@ let Predicates = [HasAVX512] in { (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; // Move low f64 and clear high bits. 
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))), - (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (v2f64 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))), (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2f64 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>; - def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))), - (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), - (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>; def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))), (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)), (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>; @@ -4425,7 +4430,9 @@ let Predicates = [HasAVX512] in { def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))), addr:$dst), (VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>; +} +let Predicates = [HasAVX512, OptForSize] in { // Shuffle with VMOVSS def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)), (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>; diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td index 8f195847528..5c8b612f49b 100644 --- a/llvm/lib/Target/X86/X86InstrSSE.td +++ b/llvm/lib/Target/X86/X86InstrSSE.td @@ -6394,8 +6394,7 @@ let Predicates = [HasAVX2] in { // Prefer a movss or movsd over a blendps when optimizing for size. these were // changed to use blends because blends have better throughput on sandybridge // and haswell, but movs[s/d] are 1-2 byte shorter instructions. 
-let Predicates = [UseAVX] in { - let Predicates = [UseAVX, OptForSpeed] in { +let Predicates = [HasAVX, OptForSpeed] in { def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), @@ -6410,7 +6409,6 @@ let Predicates = [UseAVX] in { (VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>; def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), (VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>; - } // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), |

