diff options
Diffstat (limited to 'llvm/lib')
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 38 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrFMA.td | 74 | ||||
| -rwxr-xr-x | llvm/lib/Target/X86/X86SchedBroadwell.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedHaswell.td | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSandyBridge.td | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSkylakeClient.td | 2 | ||||
| -rwxr-xr-x | llvm/lib/Target/X86/X86SchedSkylakeServer.td | 2 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleBtVer2.td | 1 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleSLM.td | 3 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleZnver1.td | 1 |
10 files changed, 75 insertions, 52 deletions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 1f2e7197ba7..c30faed2387 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -5790,13 +5790,13 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA, ReadAfterLd]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -5804,7 +5804,7 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, - AVX512FMA3Base, EVEX_B; + AVX512FMA3Base, EVEX_B, Sched<[WriteFMA, ReadAfterLd]>; } } @@ -5815,7 +5815,7 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5857,13 +5857,13 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1, vselect, 1>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA]>; defm m: AVX512_maskable_3src<opc, 
MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA, ReadAfterLd]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -5871,7 +5871,8 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B; + _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[WriteFMA, ReadAfterLd]>; } } @@ -5883,7 +5884,7 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1, vselect, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5924,7 +5925,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
@@ -5932,7 +5933,7 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>, - AVX512FMA3Base; + AVX512FMA3Base, Sched<[WriteFMA, ReadAfterLd]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -5941,7 +5942,8 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B; + _.RC:$src1, _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[WriteFMA, ReadAfterLd]>; } } @@ -5953,7 +5955,7 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1, vselect, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -5994,28 +5996,30 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, let Constraints = "$src1 = $dst", hasSideEffects = 0 in { defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, - "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base, + Sched<[WriteFMA]>; defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr, - "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base, + 
Sched<[WriteFMA, ReadAfterLd]>; defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; let isCodeGenOnly = 1, isCommutable = 1 in { def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - !if(MaskOnlyReg, [], [RHS_r])>; + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[WriteFMA]>; def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [RHS_m]>; + [RHS_m]>, Sched<[WriteFMA, ReadAfterLd]>; }// isCodeGenOnly = 1 }// Constraints = "$src1 = $dst" } diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td index dd6a61ddc3b..1b706674a4d 100644 --- a/llvm/lib/Target/X86/X86InstrFMA.td +++ b/llvm/lib/Target/X86/X86InstrFMA.td @@ -41,7 +41,8 @@ multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>; + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>, + Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), @@ -49,7 +50,8 @@ multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, - (MemFrag addr:$src3))))]>; + (MemFrag addr:$src3))))]>, + Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -60,7 +62,7 @@ multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, 
RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), @@ -68,7 +70,7 @@ multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3), - RC:$src1)))]>; + RC:$src1)))]>, Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC, @@ -79,7 +81,7 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. @@ -89,7 +91,7 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1, - RC:$src2)))]>; + RC:$src2)))]>, Sched<[WriteFMA, ReadAfterLd]>; } let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in @@ -172,7 +174,8 @@ multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>, + Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), @@ -180,7 +183,8 @@ multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>, + Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3s_rm_231<bits<8> opc, 
string OpcodeStr, @@ -191,7 +195,7 @@ multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; let mayLoad = 1 in def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), @@ -199,7 +203,8 @@ multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>; + (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>, + Sched<[WriteFMA, ReadAfterLd]>; } multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr, @@ -210,7 +215,7 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; // Pattern is 312 order so that the load is in a different place from the // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
@@ -220,7 +225,8 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), [(set RC:$dst, - (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>; + (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>, + Sched<[WriteFMA, ReadAfterLd]>; } let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in @@ -257,14 +263,14 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA]>; let mayLoad = 1 in def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, RC:$src2, memopr:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + []>, Sched<[WriteFMA, ReadAfterLd]>; } // The FMA 213 form is created for lowering of scalar FMA intrinscis @@ -360,26 +366,29 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG; + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA]>; def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG; + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA, ReadAfterLd]>; def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG; + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG, + Sched<[WriteFMA, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects 
= 0 in def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_LIG, FoldGenData<NAME#rr>; + VEX_LIG, FoldGenData<NAME#rr>, Sched<[WriteFMA]>; } multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, @@ -391,26 +400,27 @@ let isCodeGenOnly = 1 in { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W, - VEX_LIG; + VEX_LIG, Sched<[WriteFMA]>; def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2, - mem_cpat:$src3)))]>, VEX_W, VEX_LIG; + mem_cpat:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA, ReadAfterLd]>; def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>, - VEX_LIG; + VEX_LIG, Sched<[WriteFMA, ReadAfterLd]>; let hasSideEffects = 0 in def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, VEX_LIG, FoldGenData<NAME#rr_Int>; + []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[WriteFMA]>; } // isCodeGenOnly = 1 } @@ -424,19 +434,21 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX_W; + VEX_W, Sched<[WriteFMA]>; def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, 
$src1, $src2, $src3}"), [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, - (ld_frag128 addr:$src3)))]>, VEX_W; + (ld_frag128 addr:$src3)))]>, VEX_W, + Sched<[WriteFMA, ReadAfterLd]>; def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>, + Sched<[WriteFMA, ReadAfterLd]>; let isCommutable = 1 in def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), @@ -444,31 +456,33 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX_W, VEX_L; + VEX_W, VEX_L, Sched<[WriteFMA]>; def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, - (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L; + (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L, + Sched<[WriteFMA, ReadAfterLd]>; def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpNode VR256:$src1, - (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L; + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L, + Sched<[WriteFMA, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - FoldGenData<NAME#rr>; + Sched<[WriteFMA]>, FoldGenData<NAME#rr>; def 
Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_L, FoldGenData<NAME#Yrr>; + VEX_L, Sched<[WriteFMA]>, FoldGenData<NAME#Yrr>; } // isCodeGenOnly = 1 } diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index c70af22d060..7fef01c72aa 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -131,7 +131,7 @@ defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating po defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root. defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate. defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate. -// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add. defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles. defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends. def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends. 
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index c2b188b8846..5b9223432df 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -134,6 +134,7 @@ defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; +defm : HWWriteResPair<WriteFMA, HWPort01, 5>; defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>; defm : HWWriteResPair<WriteFBlend, HWPort015, 1>; defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index c86c48ce06b..c6c60bf03b2 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -276,11 +276,12 @@ def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } def : WriteRes<WriteFence, [SBPort23, SBPort4]>; def : WriteRes<WriteNop, []>; -// AVX2 is not supported on that architecture, but we should define the basic +// AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>; defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>; defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>; +defm : SBWriteResPair<WriteFMA, SBPort01, 5>; // Remaining SNB instrs. diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index aabb45be87c..eeeffdf7008 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -132,7 +132,7 @@ defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root. defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate. 
defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate. -// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add. defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles. defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends. def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends. diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 50f6379bad8..8ba1ac027ce 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -132,7 +132,7 @@ defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root. defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate. defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate. -// defm WriteFMA : X86SchedWritePair; // Fused Multiply Add. +defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add. defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles. defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends. def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends. diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index edfb58059e1..a2f02962444 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -192,6 +192,7 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>; defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFMA, JFPU1, 2>; // NOTE: Doesn't exist on Jaguar. 
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>; defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 03ed2db2350..6a2a998b5ff 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -249,7 +249,7 @@ def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; } def : WriteRes<WriteFence, [MEC_RSV]>; def : WriteRes<WriteNop, []>; -// AVX is not supported on that architecture, but we should define the basic +// AVX/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. def : WriteRes<WriteIMulH, [FPC_RSV0]>; defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>; @@ -257,4 +257,5 @@ defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>; defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>; defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>; defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>; } // SchedModel diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index c72880b7f65..5ebe8a28422 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -175,6 +175,7 @@ defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>; defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>; defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>; defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>; +defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>; defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>; defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>; defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>; |

