author    Craig Topper <craig.topper@intel.com>  2018-07-14 02:05:08 +0000
committer Craig Topper <craig.topper@intel.com>  2018-07-14 02:05:08 +0000
commit    f0b164415c383be9d20031d52cf5db9ae71cded8 (patch)
tree      26beee3fa7ce875eab5f2aa1d6b23cab85871f70 /llvm/lib
parent    70993d37e8dd2031b623071372d7aebefb18c74c (diff)
[X86] Prefer blendi over movss/sd when avx512 is enabled unless optimizing for size.
AVX512 doesn't have an immediate-controlled blend instruction, but blend throughput is still better than movss/sd on SKX. This commit changes AVX512 to use the AVX blend instructions instead of MOVSS/MOVSD. This constrains register allocation, since the VEX-encoded blends cannot use XMM16-31, but hopefully the increased throughput and reduced port 5 pressure make up for that.

llvm-svn: 337083
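A minimal C sketch (not part of the commit) of the source-level patterns the TableGen rules in the diff below select for; the function names are hypothetical, and the instruction actually emitted depends on the target CPU and optimization level:

#include <immintrin.h>

/* Merge the low float of b into a. This is the X86Movss node the
   patterns in the diff match; per the commit message, with AVX512
   enabled this now lowers to vblendps at -O2 and to the 1-2 byte
   shorter vmovss under optsize. */
__m128 merge_low(__m128 a, __m128 b) {
    return _mm_move_ss(a, b);
}

/* Keep the low double of v and zero the upper lane: the X86vzmovl
   pattern moved between predicate blocks in the diff. */
__m128d keep_low(__m128d v) {
    return _mm_move_sd(_mm_setzero_pd(), v);
}

Built with something like -mavx512f -O2, both functions should lower to VEX blends; with -Os the movss/sd forms are kept, which also frees the register allocator to use XMM16-31.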
Diffstat (limited to 'llvm/lib')

 llvm/lib/Target/X86/X86InstrAVX512.td | 23
 llvm/lib/Target/X86/X86InstrSSE.td    |  4
 2 files changed, 16 insertions(+), 11 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 3df4da13cd5..bfc26258ce4 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3936,6 +3936,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
+ let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -4324,7 +4325,7 @@ def : InstAlias<"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
(VMOVSDZrrkz_REV VR128X:$dst, VK1WM:$mask,
VR128X:$src1, VR128X:$src2), 0>;
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
@@ -4339,6 +4340,17 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4i32 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i32 VR256X:$src), sub_xmm)), sub_xmm)>;
+
+ def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0),
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
+ def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
+ (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
+ (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (X86vzmovl (v16f32 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSSZrr (v4f32 (AVX512_128_SET0)),
@@ -4405,18 +4417,11 @@ let Predicates = [HasAVX512] in {
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
// Move low f64 and clear high bits.
- def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4f64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8f64 (X86vzmovl (v8f64 VR512:$src))),
(SUBREG_TO_REG (i32 0),
(VMOVSDZrr (v2f64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm)), sub_xmm)>;
- def : Pat<(v4i64 (X86vzmovl (v4i64 VR256X:$src))),
- (SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
- (EXTRACT_SUBREG (v4i64 VR256X:$src), sub_xmm)), sub_xmm)>;
def : Pat<(v8i64 (X86vzmovl (v8i64 VR512:$src))),
(SUBREG_TO_REG (i32 0), (VMOVSDZrr (v2i64 (AVX512_128_SET0)),
(EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm)), sub_xmm)>;
@@ -4425,7 +4430,9 @@ let Predicates = [HasAVX512] in {
def : Pat<(store (f32 (extractelt (v4f32 VR128X:$src), (iPTR 0))),
addr:$dst),
(VMOVSSZmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128X:$src), FR32X))>;
+}
+let Predicates = [HasAVX512, OptForSize] in {
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
(VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 8f195847528..5c8b612f49b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6394,8 +6394,7 @@ let Predicates = [HasAVX2] in {
// Prefer a movss or movsd over a blendps when optimizing for size. these were
// changed to use blends because blends have better throughput on sandybridge
// and haswell, but movs[s/d] are 1-2 byte shorter instructions.
-let Predicates = [UseAVX] in {
- let Predicates = [UseAVX, OptForSpeed] in {
+let Predicates = [HasAVX, OptForSpeed] in {
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
@@ -6410,7 +6409,6 @@ let Predicates = [UseAVX] in {
(VBLENDPDrri VR128:$src1, VR128:$src2, (i8 1))>;
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
(VPBLENDWrri VR128:$src1, VR128:$src2, (i8 0xf))>;
- }
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),