summaryrefslogtreecommitdiffstats
path: root/llvm/lib
diff options
context:
space:
mode:
authorCraig Topper <craig.topper@intel.com>2018-05-10 21:49:16 +0000
committerCraig Topper <craig.topper@intel.com>2018-05-10 21:49:16 +0000
commit1ee19ae126291314d047c0032a04549f222dbc96 (patch)
treeb52a7999039d480fb5b7a529899d8b8975fdbc32 /llvm/lib
parent2903a9bb0201fdb56da3e102bb5de917f16583b0 (diff)
downloadbcm5719-llvm-1ee19ae126291314d047c0032a04549f222dbc96.tar.gz
bcm5719-llvm-1ee19ae126291314d047c0032a04549f222dbc96.zip
[X86] Add new patterns for masked scalar load/store to match clang's codegen from r331958.
Clang's codegen now uses 128-bit masked load/store intrinsics in IR. The backend will widen to 512-bits on AVX512F targets. So this patch adds patterns to detect codegen's widening and patterns for AVX512VL that don't get widened. We may be able to drop some of the old patterns, but I leave that for a future patch. llvm-svn: 332049
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td117
1 file changed, 117 insertions, 0 deletions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index cf2e33c0303..d078852c7e8 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -3876,6 +3876,31 @@ def : Pat<(masked_store addr:$dst, Mask,
}
+// This matches the more recent codegen from clang that avoids emitting a 512
+// bit masked store directly. Codegen will widen 128-bit masked store to 512
+// bits on AVX512F only targets.
+multiclass avx512_store_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+
+// AVX512F pattern.
+def : Pat<(masked_store addr:$dst, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT _.info128.RC:$src),
+ (iPTR 0)))),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+// AVX512VL pattern.
+def : Pat<(masked_store addr:$dst, Mask128, (_.info128.VT _.info128.RC:$src)),
+ (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+}
+
multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
dag Mask, RegisterClass MaskRC> {
@@ -3926,6 +3951,48 @@ def : Pat<(_.info128.VT (extract_subvector
}
+// This matches the more recent codegen from clang that avoids emitting a 512
+// bit masked load directly. Codegen will widen 128-bit masked load to 512
+// bits on AVX512F only targets.
+multiclass avx512_load_scalar_lowering_subreg2<string InstrStr,
+ AVX512VLVectorVTInfo _,
+ dag Mask512, dag Mask128,
+ RegisterClass MaskRC,
+ SubRegIndex subreg> {
+// AVX512F patterns.
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (bitconvert
+ (v16i32 immAllZerosV))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (extract_subvector
+ (_.info512.VT (masked_load addr:$srcAddr, Mask512,
+ (_.info512.VT (insert_subvector undef,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+ (iPTR 0))))),
+ (iPTR 0))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+// AVX512VL patterns.
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (bitconvert (v4i32 immAllZerosV))))),
+ (!cast<Instruction>(InstrStr#rmkz)
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+
+def : Pat<(_.info128.VT (masked_load addr:$srcAddr, Mask128,
+ (_.info128.VT (X86vzmovl _.info128.RC:$src)))),
+ (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM),
+ addr:$srcAddr)>;
+}
+
defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
@@ -3936,6 +4003,31 @@ defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_store_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
+
defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
(v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
@@ -3943,6 +4035,31 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSSZ", avx512vl_f32_info,
+ (v16i1 (insert_subvector
+ (v16i1 immAllZerosV),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v4i1 (extract_subvector
+ (v8i1 (bitconvert (and GR8:$mask, (i8 1)))),
+ (iPTR 0))), GR8, sub_8bit>;
+defm : avx512_load_scalar_lowering_subreg2<"VMOVSDZ", avx512vl_f64_info,
+ (v8i1
+ (extract_subvector
+ (v16i1
+ (insert_subvector
+ (v16i1 immAllZerosV),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (iPTR 0))),
+ (v2i1 (extract_subvector
+ (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))),
+ (iPTR 0))), GR8, sub_8bit>;
+
def : Pat<(f32 (X86selects (scalar_to_vector GR8:$mask),
(f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS
OpenPOWER on IntegriCloud