 llvm/lib/Target/X86/X86InstrAVX512.td        | 23
 llvm/lib/Target/X86/X86InstrInfo.cpp         |  1
 llvm/lib/Target/X86/X86InstrSSE.td           | 29
 llvm/test/CodeGen/X86/fold-load-binops.ll    |  6
 llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll | 48
 5 files changed, 63 insertions(+), 44 deletions(-)
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index f428b201adc..6ceb5517863 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -11871,6 +11871,12 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
                       _.FRC:$src)))),
           (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
            (_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
+    def : Pat<(MoveNode
+               (_.VT VR128X:$dst),
+               (_.VT (scalar_to_vector
+                      (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
+                          (_.ScalarLdFrag addr:$src))))),
+              (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
 
     // extracted masked scalar math op with insert via movss
     def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11884,6 +11890,16 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
                    (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
                    VK1WM:$mask, _.VT:$src1,
                    (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+    def : Pat<(MoveNode (_.VT VR128X:$src1),
+               (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                            (Op (_.EltVT
+                                 (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+                                (_.ScalarLdFrag addr:$src2)),
+                            _.FRC:$src0))),
+              (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+               (_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
+               VK1WM:$mask, _.VT:$src1, addr:$src2)>;
 
     // extracted masked scalar math op with insert via movss
     def : Pat<(MoveNode (_.VT VR128X:$src1),
@@ -11895,6 +11911,13 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
           (!cast<I>("V"#OpcPrefix#Zrr_Intkz)
            VK1WM:$mask, _.VT:$src1,
            (_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
+    def : Pat<(MoveNode (_.VT VR128X:$src1),
+               (scalar_to_vector
+                (X86selects VK1WM:$mask,
+                            (Op (_.EltVT
+                                 (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+                                (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+          (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 4d1791d6728..2fe438e3def 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4685,6 +4685,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
                                                   &RI, MF);
     unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
     if (Size < RCSize) {
+      // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
      // Check if it's safe to fold the load. If the size of the object is
      // narrower than the load width, then it's not.
      if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
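For context, here is a minimal hypothetical sketch (names and exact IR shape are invented, not taken from this patch or its tests) of the masked scalar-select idiom that the new ...Zrm_Intk pattern above is aimed at, with the second operand coming from memory. With AVX-512, this shape is expected to fold into a single masked scalar add with a memory operand instead of a separate scalar load:

; Hypothetical example only -- illustrative, not part of the patch.
define <4 x float> @mask_add_ss_mem(<4 x float> %a, float* %p,
                                    <4 x float> %passthru, i8 %mask) {
  %ea  = extractelement <4 x float> %a, i64 0
  %eb  = load float, float* %p                   ; memory operand for the scalar op
  %sum = fadd float %ea, %eb                     ; scalar op on element 0
  %mv  = bitcast i8 %mask to <8 x i1>
  %m0  = extractelement <8 x i1> %mv, i64 0      ; bit 0 of the k-mask
  %ep  = extractelement <4 x float> %passthru, i64 0
  %sel = select i1 %m0, float %sum, float %ep    ; merge masking with the passthru scalar
  %res = insertelement <4 x float> %a, float %sel, i64 0
  ret <4 x float> %res
}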
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 65849373e91..7d93d1bd985 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -2692,7 +2692,8 @@ let isCodeGenOnly = 1 in {
 // patterns we have to try to match.
 multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                 ValueType VT, ValueType EltTy,
-                                RegisterClass RC, Predicate BasePredicate> {
+                                RegisterClass RC, PatFrag ld_frag,
+                                Predicate BasePredicate> {
   let Predicates = [BasePredicate] in {
     // extracted scalar math op with insert via movss/movsd
     def : Pat<(VT (Move (VT VR128:$dst),
@@ -2701,6 +2702,11 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                  RC:$src))))),
               (!cast<Instruction>(OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 (ld_frag addr:$src)))))),
+              (!cast<Instruction>(OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 
   // Repeat for AVX versions of the instructions.
@@ -2712,18 +2718,23 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
                                  RC:$src))))),
               (!cast<Instruction>("V"#OpcPrefix#rr_Int) VT:$dst,
                (VT (COPY_TO_REGCLASS RC:$src, VR128)))>;
+    def : Pat<(VT (Move (VT VR128:$dst),
+                        (VT (scalar_to_vector
+                             (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
+                                 (ld_frag addr:$src)))))),
+              (!cast<Instruction>("V"#OpcPrefix#rm_Int) VT:$dst, addr:$src)>;
   }
 }
 
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
+defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
 
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
+defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
 
 /// Unop Arithmetic
 /// In addition, we also have a special variant of the scalar form here to
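The test changes below add -disable-peephole to the RUN lines, so any load folding in the output has to come from these new ISel patterns rather than from the later peephole pass (see the FIXME added in foldMemoryOperandImpl above). As a rough sketch of the kind of IR being exercised (names are illustrative, not copied from fold-load-binops.ll), the extract/op/insert idiom with a loaded scalar operand should now select addss/vaddss with a memory operand directly:

; Hypothetical example only -- illustrative of the folded shape.
define <4 x float> @fold_addss_load(<4 x float> %v, float* %p) {
  %s = load float, float* %p                     ; scalar memory operand
  %e = extractelement <4 x float> %v, i32 0
  %r = fadd float %e, %s                         ; scalar op on element 0
  %i = insertelement <4 x float> %v, float %r, i32 0
  ret <4 x float> %i                             ; upper lanes of %v unchanged
}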
diff --git a/llvm/test/CodeGen/X86/fold-load-binops.ll b/llvm/test/CodeGen/X86/fold-load-binops.ll
index 2d4fc723baa..11732db550b 100644
--- a/llvm/test/CodeGen/X86/fold-load-binops.ll
+++ b/llvm/test/CodeGen/X86/fold-load-binops.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
 
 ; Verify that we're folding the load into the math instruction.
 ; This pattern is generated out of the simplest intrinsics usage:
diff --git a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
index 1a5e06539e7..bd037d7079b 100644
--- a/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ b/llvm/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -414,14 +414,12 @@ define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_add_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    addss %xmm1, %xmm0
+; X86-SSE-NEXT:    addss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_add_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_add_ss:
@@ -444,14 +442,12 @@ define <4 x float> @blend_add_ss(<4 x float> %a, float %b) {
 define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_sub_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    subss %xmm1, %xmm0
+; X86-SSE-NEXT:    subss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_sub_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_sub_ss:
@@ -474,14 +470,12 @@ define <4 x float> @blend_sub_ss(<4 x float> %a, float %b) {
 define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_mul_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    mulss %xmm1, %xmm0
+; X86-SSE-NEXT:    mulss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_mul_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_mul_ss:
@@ -504,14 +498,12 @@ define <4 x float> @blend_mul_ss(<4 x float> %a, float %b) {
 define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
 ; X86-SSE-LABEL: blend_div_ss:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT:    divss %xmm1, %xmm0
+; X86-SSE-NEXT:    divss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_div_ss:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-AVX-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vdivss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_div_ss:
@@ -534,14 +526,12 @@ define <4 x float> @blend_div_ss(<4 x float> %a, float %b) {
 define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_add_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    addsd %xmm1, %xmm0
+; X86-SSE-NEXT:    addsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_add_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vaddsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_add_sd:
@@ -564,14 +554,12 @@ define <2 x double> @blend_add_sd(<2 x double> %a, double %b) {
 define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_sub_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    subsd %xmm1, %xmm0
+; X86-SSE-NEXT:    subsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_sub_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vsubsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_sub_sd:
@@ -594,14 +582,12 @@ define <2 x double> @blend_sub_sd(<2 x double> %a, double %b) {
 define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_mul_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    mulsd %xmm1, %xmm0
+; X86-SSE-NEXT:    mulsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_mul_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vmulsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vmulsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_mul_sd:
@@ -624,14 +610,12 @@ define <2 x double> @blend_mul_sd(<2 x double> %a, double %b) {
 define <2 x double> @blend_div_sd(<2 x double> %a, double %b) {
 ; X86-SSE-LABEL: blend_div_sd:
 ; X86-SSE:       # %bb.0:
-; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; X86-SSE-NEXT:    divsd %xmm1, %xmm0
+; X86-SSE-NEXT:    divsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT:    retl
 ;
 ; X86-AVX-LABEL: blend_div_sd:
 ; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; X86-AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
+; X86-AVX-NEXT:    vdivsd {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: blend_div_sd:
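A note on the blend_* checks above: the scalar operand %b is a function argument, and under the 32-bit calling convention used by the X86-* check prefixes it is passed on the stack. That is why the updated checks fold the operand as {{[0-9]+}}(%esp) instead of first loading it with movss/movsd. A reduced, hypothetical sketch of that shape (illustrative only; the actual tests may insert the result via a different blend form):

; Hypothetical example only -- not copied from sse-scalar-fp-arith.ll.
define <4 x float> @blend_add_ss_sketch(<4 x float> %a, float %b) {
  %e = extractelement <4 x float> %a, i32 0
  %r = fadd float %e, %b                         ; %b is a stack argument on x86-32
  %i = insertelement <4 x float> %a, float %r, i32 0
  ret <4 x float> %i
}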

