author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-03-15 14:45:30 +0000
---|---|---
committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-03-15 14:45:30 +0000
commit | fb7aa57bf14268e229b4eab57e105b691f1c48b3 (patch) |
tree | 1164f88eeb366330d5fc8a6e5bd3c2785aaf83cf |
parent | 9a54397f81c9882d9cd25bde666f4b0c50bf0654 (diff) |
[X86][SSE] Introduce Float/Vector WriteMove, WriteLoad and WriteStore scheduler classes
As discussed on D44428 and PR36726, this patch splits off WriteFMove/WriteVecMove, WriteFLoad/WriteVecLoad and WriteFStore/WriteVecStore scheduler classes to permit vectors to be handled separately from gpr/scalar types.
I've minimised the diff here by only moving various basic SSE/AVX vector instructions across - we can fix the rest when called for. This does fix the MOVDQA vs MOVAPS/MOVAPD discrepancies mentioned on D44428.
Differential Revision: https://reviews.llvm.org/D44471
llvm-svn: 327630
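
In outline, the change has three parts: X86Schedule.td declares the new SchedWrite classes, each CPU model binds them to its own ports, and instruction definitions switch their `Sched`/`SchedRW` annotations over. A condensed sketch, lifted from the hunks below (these lines live in different files and lean on the surrounding X86 target definitions, so treat it as an illustrative excerpt rather than a standalone snippet):

```tablegen
// X86Schedule.td: scalar/vector-FP and vector-integer variants of the
// previously shared WriteMove/WriteLoad/WriteStore classes.
def WriteFLoad    : SchedWrite;
def WriteFStore   : SchedWrite;
def WriteFMove    : SchedWrite;
def WriteVecLoad  : SchedWrite;
def WriteVecStore : SchedWrite;
def WriteVecMove  : SchedWrite;

// X86InstrSSE.td: MOVDQA's store form now uses the vector-integer class
// instead of the old catch-all WriteStore. (Condensed: the original `let`
// block wraps several defs and keeps a commented-out store pattern.)
let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore] in
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                   "movdqa\t{$src, $dst|$dst, $src}", [],
                   IIC_SSE_MOVA_P_RR>;
```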
-rw-r--r-- | llvm/lib/Target/X86/X86InstrSSE.td | 80
-rwxr-xr-x | llvm/lib/Target/X86/X86SchedBroadwell.td | 8
-rw-r--r-- | llvm/lib/Target/X86/X86SchedHaswell.td | 8
-rw-r--r-- | llvm/lib/Target/X86/X86SchedSandyBridge.td | 8
-rw-r--r-- | llvm/lib/Target/X86/X86SchedSkylakeClient.td | 8
-rwxr-xr-x | llvm/lib/Target/X86/X86SchedSkylakeServer.td | 8
-rw-r--r-- | llvm/lib/Target/X86/X86Schedule.td | 6
-rw-r--r-- | llvm/lib/Target/X86/X86ScheduleBtVer2.td | 9
-rw-r--r-- | llvm/lib/Target/X86/X86ScheduleSLM.td | 8
-rw-r--r-- | llvm/lib/Target/X86/X86ScheduleZnver1.td | 8
-rw-r--r-- | llvm/test/CodeGen/X86/avx-schedule.ll | 8
-rw-r--r-- | llvm/test/CodeGen/X86/avx2-schedule.ll | 10
-rw-r--r-- | llvm/test/CodeGen/X86/sha-schedule.ll | 12
-rw-r--r-- | llvm/test/CodeGen/X86/sse-schedule.ll | 6
-rw-r--r-- | llvm/test/CodeGen/X86/sse2-schedule.ll | 8
-rw-r--r-- | llvm/test/CodeGen/X86/sse3-schedule.ll | 6
-rw-r--r-- | llvm/test/CodeGen/X86/sse41-schedule.ll | 22
17 files changed, 148 insertions, 75 deletions
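
Each scheduler model in the list gives the six classes its own port bindings: the Intel models issue loads on the load ports and stores on the store-address/store-data ports, while the AMD models route them through dedicated load/store AGUs. Two representative mappings, condensed from the hunks below:

```tablegen
// X86SchedBroadwell.td (Intel): loads on ports 2/3, stores on ports 2/3/7
// plus the store-data port 4, FP register moves on port 5.
def : WriteRes<WriteFLoad,  [BWPort23]> { let Latency = 5; }
def : WriteRes<WriteFStore, [BWPort237, BWPort4]>;
def : WriteRes<WriteFMove,  [BWPort5]>;

// X86ScheduleBtVer2.td (AMD Jaguar): separate load and store AGUs,
// vector register moves on either FP pipe.
def : WriteRes<WriteVecLoad,  [JLAGU]> { let Latency = 5; }
def : WriteRes<WriteVecStore, [JSAGU]>;
def : WriteRes<WriteVecMove,  [JFPU01]>;
```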
```diff
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 7c07cfa02bd..516504b712c 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -649,12 +649,12 @@ multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
 let hasSideEffects = 0 in
 def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
             !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
-            Sched<[WriteFShuffle]>;
+            Sched<[WriteFMove]>;
 let canFoldAsLoad = 1, isReMaterializable = 1 in
 def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
             !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
             [(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
-            Sched<[WriteLoad]>;
+            Sched<[WriteFLoad]>;
 }

 let Predicates = [HasAVX, NoVLX] in {
@@ -702,7 +702,7 @@ defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                                PD;
 }

-let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [WriteFStore], Predicates = [HasAVX, NoVLX] in {
 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                      "movaps\t{$src, $dst|$dst, $src}",
                      [(alignedstore (v4f32 VR128:$src), addr:$dst)],
@@ -801,7 +801,7 @@ def : InstAlias<"vmovups\t{$src, $dst|$dst, $src}",
 def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
                 (VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;

-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteFStore] in {
 def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v4f32 VR128:$src), addr:$dst)],
@@ -3385,8 +3385,8 @@ defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
 //===----------------------------------------------------------------------===//

 let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteStore] in {
 let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [WriteFStore] in {
 def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                       (ins f128mem:$dst, VR128:$src),
                       "movntps\t{$src, $dst|$dst, $src}",
@@ -3400,14 +3400,6 @@ def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                       addr:$dst)],
                       IIC_SSE_MOVNT>, VEX, VEX_WIG;

-let ExeDomain = SSEPackedInt in
-def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
-                      (ins i128mem:$dst, VR128:$src),
-                      "movntdq\t{$src, $dst|$dst, $src}",
-                      [(alignednontemporalstore (v2i64 VR128:$src),
-                      addr:$dst)],
-                      IIC_SSE_MOVNT>, VEX, VEX_WIG;
-
 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                        (ins f256mem:$dst, VR256:$src),
                        "movntps\t{$src, $dst|$dst, $src}",
@@ -3420,15 +3412,25 @@ def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                        [(alignednontemporalstore (v4f64 VR256:$src),
                        addr:$dst)],
                        IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-let ExeDomain = SSEPackedInt in
+} // SchedRW
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+                      (ins i128mem:$dst, VR128:$src),
+                      "movntdq\t{$src, $dst|$dst, $src}",
+                      [(alignednontemporalstore (v2i64 VR128:$src),
+                      addr:$dst)],
+                      IIC_SSE_MOVNT>, VEX, VEX_WIG;
 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
                        (ins i256mem:$dst, VR256:$src),
                        "movntdq\t{$src, $dst|$dst, $src}",
                        [(alignednontemporalstore (v4i64 VR256:$src),
                        addr:$dst)],
                        IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-}
+} // ExeDomain, SchedRW
+} // Predicates

+let SchedRW = [WriteVecStore] in {
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntps\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
@@ -3437,13 +3439,15 @@ def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst,
                     VR128:$src),
                     "movntpd\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVNT>;
+} // SchedRW

-let ExeDomain = SSEPackedInt in
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in
 def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                     "movntdq\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
                     IIC_SSE_MOVNT>;

+let SchedRW = [WriteStore] in {
 // There is no AVX form for instructions below this point
 def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
                  "movnti{l}\t{$src, $dst|$dst, $src}",
@@ -3560,7 +3564,7 @@ def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),

 let ExeDomain = SSEPackedInt in { // SSE integer instructions

-let hasSideEffects = 0, SchedRW = [WriteMove] in {
+let hasSideEffects = 0, SchedRW = [WriteVecMove] in {
 def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [],
                      IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
@@ -3577,7 +3581,7 @@ def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),

 // For Disassembler
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
-    SchedRW = [WriteMove] in {
+    SchedRW = [WriteVecMove] in {
 def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                          "movdqa\t{$src, $dst|$dst, $src}", [],
                          IIC_SSE_MOVA_P_RR>,
@@ -3597,7 +3601,7 @@ def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
 }

 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
-    hasSideEffects = 0, SchedRW = [WriteLoad], Predicates = [HasAVX,NoVLX] in {
+    hasSideEffects = 0, SchedRW = [WriteVecLoad], Predicates = [HasAVX,NoVLX] in {
 def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                      "movdqa\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
@@ -3614,7 +3618,7 @@ def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    XS, VEX, VEX_L, VEX_WIG;
 }

-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore],
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore],
     Predicates = [HasAVX,NoVLX] in {
 def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
@@ -3634,7 +3638,7 @@ def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                    XS, VEX, VEX_L, VEX_WIG;
 }

-let SchedRW = [WriteMove] in {
+let SchedRW = [WriteVecMove] in {
 let hasSideEffects = 0 in {
 def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
@@ -3658,7 +3662,7 @@ def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
 } // SchedRW

 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
-    hasSideEffects = 0, SchedRW = [WriteLoad] in {
+    hasSideEffects = 0, SchedRW = [WriteVecLoad] in {
 def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
@@ -3670,7 +3674,7 @@ def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    XS, Requires<[UseSSE2]>;
 }

-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore] in {
 def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
                    "movdqa\t{$src, $dst|$dst, $src}",
                    [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
@@ -4302,7 +4306,7 @@ def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
 // SSE2 - Conditional Store
 //===---------------------------------------------------------------------===//

-let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {

 let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
 def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
@@ -4826,7 +4830,7 @@ let Predicates = [UseSSE3] in {
 // SSE3 - Move Unaligned Integer
 //===---------------------------------------------------------------------===//

-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteVecLoad] in {
 let Predicates = [HasAVX] in {
 def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                     "vlddqu\t{$src, $dst|$dst, $src}",
@@ -4835,12 +4839,12 @@ let Predicates = [HasAVX] in {
                     "vlddqu\t{$src, $dst|$dst, $src}",
                     [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
                     VEX, VEX_L, VEX_WIG;
-}
+} // Predicates
 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "lddqu\t{$src, $dst|$dst, $src}",
                    [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
                    IIC_SSE_LDDQU>;
-}
+} // SchedRW

 //===---------------------------------------------------------------------===//
 // SSE3 - Arithmetic
@@ -6832,7 +6836,7 @@ let Predicates = [UseSSE41] in {
 }

 let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteVecLoad] in {
 let Predicates = [HasAVX, NoVLX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                         "vmovntdqa\t{$src, $dst|$dst, $src}", []>,
@@ -7616,7 +7620,7 @@ let mayStore = 1 in
 def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
                              (ins f128mem:$dst, VR256:$src1, u8imm:$src2),
                              "vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                             []>, Sched<[WriteStore]>, VEX, VEX_L;
+                             []>, Sched<[WriteFStore]>, VEX, VEX_L;
 }

 multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
@@ -7653,22 +7657,22 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
              (ins VR128:$src1, f128mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))],
-             IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+             IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteFLoad]>;
   def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
              (ins VR256:$src1, f256mem:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
              [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
-             IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+             IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteFLoad]>;
   def mr : AVX8I<opc_mr, MRMDestMem, (outs),
            (ins f128mem:$dst, VR128:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
-           VEX_4V, Sched<[WriteStore]>;
+           VEX_4V, Sched<[WriteFStore]>;
   def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
            (ins f256mem:$dst, VR256:$src1, VR256:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
-           VEX_4V, VEX_L, Sched<[WriteStore]>;
+           VEX_4V, VEX_L, Sched<[WriteFStore]>;
 }

 let ExeDomain = SSEPackedSingle in
@@ -8271,7 +8275,7 @@ let hasSideEffects = 0, mayStore = 1 in
 def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
                               (ins i128mem:$dst, VR256:$src1, u8imm:$src2),
                               "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
-                              Sched<[WriteStore]>, VEX, VEX_L;
+                              Sched<[WriteVecStore]>, VEX, VEX_L;

 let Predicates = [HasAVX2, NoVLX] in {
 defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
@@ -8290,22 +8294,22 @@ multiclass avx2_pmovmask<string OpcodeStr,
             (ins VR128:$src1, i128mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))],
-            IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+            IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteVecLoad]>;
   def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
             (ins VR256:$src1, i256mem:$src2),
             !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
-            IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+            IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteVecLoad]>;
   def mr : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i128mem:$dst, VR128:$src1, VR128:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
-           VEX_4V, Sched<[WriteStore]>;
+           VEX_4V, Sched<[WriteVecStore]>;
   def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
            (ins i256mem:$dst, VR256:$src1, VR256:$src2),
            !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
            [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
-           VEX_4V, VEX_L, Sched<[WriteStore]>;
+           VEX_4V, VEX_L, Sched<[WriteVecStore]>;
 }

 defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 6441fd557d7..0d16e9a7cfd 100755
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -128,6 +128,10 @@ def : InstRW<[WriteMove], (instrs COPY)>;
 defm : BWWriteResPair<WriteJump, BWPort06, 1>;

 // Floating point. This covers both scalar and vector operations.
+def : WriteRes<WriteFLoad, [BWPort23]> { let Latency = 5; }
+def : WriteRes<WriteFStore, [BWPort237, BWPort4]>;
+def : WriteRes<WriteFMove, [BWPort5]>;
+
 defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare.
 defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication.
 defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division.
@@ -150,6 +154,10 @@ def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> {
 // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }

 // Vector integer operations.
+def : WriteRes<WriteVecLoad, [BWPort23]> { let Latency = 5; }
+def : WriteRes<WriteVecStore, [BWPort237, BWPort4]>;
+def : WriteRes<WriteVecMove, [BWPort015]>;
+
 defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals.
 defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts.
 defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply.
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index fbaf745badb..36f6faa073d 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -125,6 +125,10 @@ def : WriteRes<WriteIDivLd, [HWPort23, HWPort0, HWDivider]> {
 }

 // Scalar and vector floating point.
+def : WriteRes<WriteFStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteFLoad, [HWPort23]> { let Latency = 5; }
+def : WriteRes<WriteFMove, [HWPort5]>;
+
 defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
 defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
 defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
@@ -149,6 +153,10 @@ def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
 }

 // Vector integer operations.
+def : WriteRes<WriteVecStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteVecLoad, [HWPort23]> { let Latency = 5; }
+def : WriteRes<WriteVecMove, [HWPort015]>;
+
 defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
 defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
 defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index bec8f6277cd..32d20d1c18c 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -114,6 +114,10 @@ def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
 }

 // Scalar and vector floating point.
+def : WriteRes<WriteFStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteFLoad, [SBPort23]> { let Latency = 6; }
+def : WriteRes<WriteFMove, [SBPort5]>;
+
 defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
 defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
 defm : SBWriteResPair<WriteFDiv, SBPort0, 24>;
@@ -135,6 +139,10 @@ def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
 }

 // Vector integer operations.
+def : WriteRes<WriteVecStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteVecLoad, [SBPort23]> { let Latency = 6; }
+def : WriteRes<WriteVecMove, [SBPort05]>;
+
 defm : SBWriteResPair<WriteVecShift, SBPort5, 1>;
 defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>;
 defm : SBWriteResPair<WriteVecALU, SBPort1, 3>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index abad9cee2f7..daa57673b57 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -126,6 +126,10 @@ def : WriteRes<WriteZero, []>;
 defm : SKLWriteResPair<WriteJump, SKLPort06, 1>;

 // Floating point. This covers both scalar and vector operations.
+def : WriteRes<WriteFLoad, [SKLPort23]> { let Latency = 6; }
+def : WriteRes<WriteFStore, [SKLPort237, SKLPort4]>;
+def : WriteRes<WriteFMove, [SKLPort015]>;
+
 defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare.
 defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication.
 defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating point division.
@@ -148,6 +152,10 @@ def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> {
 // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }

 // Vector integer operations.
+def : WriteRes<WriteVecLoad, [SKLPort23]> { let Latency = 6; }
+def : WriteRes<WriteVecStore, [SKLPort237, SKLPort4]>;
+def : WriteRes<WriteVecMove, [SKLPort015]>;
+
 defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals.
 defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts.
 defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index bc5ecc68a37..9ad1d025837 100755
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -126,6 +126,10 @@ def : WriteRes<WriteZero, []>;
 defm : SKXWriteResPair<WriteJump, SKXPort06, 1>;

 // Floating point. This covers both scalar and vector operations.
+def : WriteRes<WriteFLoad, [SKXPort23]> { let Latency = 5; }
+def : WriteRes<WriteFStore, [SKXPort237, SKXPort4]>;
+def : WriteRes<WriteFMove, [SKXPort015]>;
+
 defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare.
 defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication.
 defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division.
@@ -148,6 +152,10 @@ def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> {
 // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }

 // Vector integer operations.
+def : WriteRes<WriteVecLoad, [SKXPort23]> { let Latency = 5; }
+def : WriteRes<WriteVecStore, [SKXPort237, SKXPort4]>;
+def : WriteRes<WriteVecMove, [SKXPort015]>;
+
 defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals.
 defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts.
 defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply.
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 2e21a97541b..9a87b2d598e 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -63,6 +63,9 @@ def WriteZero : SchedWrite;
 defm WriteJump : X86SchedWritePair;

 // Floating point. This covers both scalar and vector operations.
+def WriteFLoad : SchedWrite;
+def WriteFStore : SchedWrite;
+def WriteFMove : SchedWrite;
 defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
 defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
 defm WriteFDiv : X86SchedWritePair; // Floating point division.
@@ -82,6 +85,9 @@ defm WriteFHAdd : X86SchedWritePair;
 defm WritePHAdd : X86SchedWritePair;

 // Vector integer operations.
+def WriteVecLoad : SchedWrite;
+def WriteVecStore : SchedWrite;
+def WriteVecMove : SchedWrite;
 defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
 defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
 defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 9fd1ec10506..f0272617619 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -214,7 +214,6 @@ def: InstRW<[JWriteSHLDm],(instrs SHLD16mri8, SHLD32mri8, SHLD64mri8,

 ////////////////////////////////////////////////////////////////////////////////
 // Loads, stores, and moves, not folded with other operations.
-// FIXME: Split x86 and SSE load/store/moves
 ////////////////////////////////////////////////////////////////////////////////

 def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
@@ -255,6 +254,10 @@ def : WriteRes<WriteNop, []>;
 // FIXME: SS vs PS latencies
 ////////////////////////////////////////////////////////////////////////////////

+def : WriteRes<WriteFLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteFStore, [JSAGU]>;
+def : WriteRes<WriteFMove, [JFPU01]>;
+
 defm : JWriteResFpuPair<WriteFAdd, [JFPU0], 3>;
 defm : JWriteResFpuPair<WriteFMul, [JFPU1], 2>;
 defm : JWriteResFpuPair<WriteFMA, [JFPU1], 2>; // NOTE: Doesn't exist on Jaguar.
@@ -280,6 +283,10 @@ defm : JWriteResFpuPair<WriteCvtF2F, [JFPU1], 3>; // Float -> Float size c

 // Vector integer operations.
 ////////////////////////////////////////////////////////////////////////////////

+def : WriteRes<WriteVecLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteVecStore, [JSAGU]>;
+def : WriteRes<WriteVecMove, [JFPU01]>;
+
 defm : JWriteResFpuPair<WriteVecALU, [JFPU01], 1>;
 defm : JWriteResFpuPair<WriteVecShift, [JFPU01], 1>;
 defm : JWriteResFpuPair<WriteVecIMul, [JFPU0], 2>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 35ec7488db7..0292ce4af1f 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -101,6 +101,10 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
 }

 // Scalar and vector floating point.
+def : WriteRes<WriteFStore, [FPC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteFLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMove, [FPC_RSV01]>;
+
 defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
 defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
 defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
@@ -131,6 +135,10 @@ def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
 }

 // Vector integer operations.
+def : WriteRes<WriteVecStore, [FPC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteVecLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMove, [FPC_RSV01]>;
+
 defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
 defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
 defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index ce85c78e6ff..1b1d1e85f91 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -167,6 +167,10 @@ def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
 }

 // Floating point operations
+def : WriteRes<WriteFStore, [ZnAGU]>;
+def : WriteRes<WriteFMove, [ZnFPU]>;
+def : WriteRes<WriteFLoad, [ZnAGU]> { let Latency = 8; }
+
 defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
 defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
 defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
@@ -184,6 +188,10 @@ defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
 defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;

 // Vector integer operations which uses FPU units
+def : WriteRes<WriteVecStore, [ZnAGU]>;
+def : WriteRes<WriteVecMove, [ZnFPU]>;
+def : WriteRes<WriteVecLoad, [ZnAGU]> { let Latency = 8; }
+
 defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
 defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
 defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll
index 6d25787c190..7f583a47efd 100644
--- a/llvm/test/CodeGen/X86/avx-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx-schedule.ll
@@ -2103,7 +2103,7 @@ define <2 x double> @test_maskmovpd(i8* %a0, <2 x i64> %a1, <2 x double> %a2) {
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:0.50]
 ; ZNVER1-NEXT:    vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:0.50]
-; ZNVER1-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT:    vmovapd %xmm2, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1)
   call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2)
@@ -2166,7 +2166,7 @@ define <4 x double> @test_maskmovpd_ymm(i8* %a0, <4 x i64> %a1, <4 x double> %a2
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:1.00]
 ; ZNVER1-NEXT:    vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; ZNVER1-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT:    vmovapd %ymm2, %ymm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1)
   call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2)
@@ -2229,7 +2229,7 @@ define <4 x float> @test_maskmovps(i8* %a0, <4 x i32> %a1, <4 x float> %a2) {
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:0.50]
 ; ZNVER1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:0.50]
-; ZNVER1-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1)
   call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2)
@@ -2292,7 +2292,7 @@ define <8 x float> @test_maskmovps_ymm(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
 ; ZNVER1:       # %bb.0:
 ; ZNVER1-NEXT:    vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:1.00]
 ; ZNVER1-NEXT:    vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; ZNVER1-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT:    vmovaps %ymm2, %ymm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1)
   call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll
index d06b6a78a8c..e7152f867e8 100644
--- a/llvm/test/CodeGen/X86/avx2-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx2-schedule.ll
@@ -573,7 +573,7 @@ define <8 x i32> @test_inserti128(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2)
 define <4 x i64> @test_movntdqa(i8* %a0) {
 ; GENERIC-LABEL: test_movntdqa:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT:    vmovntdqa (%rdi), %ymm0 # sched: [6:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
 ;
 ; HASWELL-LABEL: test_movntdqa:
@@ -3380,7 +3380,7 @@ declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readn
 define <4 x i32> @test_pmaskmovd(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
 ; GENERIC-LABEL: test_pmaskmovd:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3429,7 +3429,7 @@ declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
 define <8 x i32> @test_pmaskmovd_ymm(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
 ; GENERIC-LABEL: test_pmaskmovd_ymm:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3478,7 +3478,7 @@ declare void @llvm.x86.avx2.maskstore.d.256(i8*, <8 x i32>, <8 x i32>) nounwind
 define <2 x i64> @test_pmaskmovq(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
 ; GENERIC-LABEL: test_pmaskmovq:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %xmm2, %xmm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
@@ -3527,7 +3527,7 @@ declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
 define <4 x i64> @test_pmaskmovq_ymm(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; GENERIC-LABEL: test_pmaskmovq_ymm:
 ; GENERIC:       # %bb.0:
-; GENERIC-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [4:0.50]
+; GENERIC-NEXT:    vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [6:0.50]
 ; GENERIC-NEXT:    vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [1:1.00]
 ; GENERIC-NEXT:    vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
 ; GENERIC-NEXT:    retq # sched: [1:1.00]
diff --git a/llvm/test/CodeGen/X86/sha-schedule.ll b/llvm/test/CodeGen/X86/sha-schedule.ll
index 138ff888b92..cecdc1c41b6 100644
--- a/llvm/test/CodeGen/X86/sha-schedule.ll
+++ b/llvm/test/CodeGen/X86/sha-schedule.ll
@@ -210,11 +210,11 @@ define <4 x i32> @test_sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2,
 ;
 ; GOLDMONT-LABEL: test_sha256rnds2:
 ; GOLDMONT:       # %bb.0:
-; GOLDMONT-NEXT:    movaps %xmm0, %xmm3 # sched: [1:1.00]
-; GOLDMONT-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GOLDMONT-NEXT:    movaps %xmm0, %xmm3 # sched: [1:0.50]
+; GOLDMONT-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
 ; GOLDMONT-NEXT:    sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00]
 ; GOLDMONT-NEXT:    sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [7:1.00]
-; GOLDMONT-NEXT:    movaps %xmm3, %xmm0 # sched: [1:1.00]
+; GOLDMONT-NEXT:    movaps %xmm3, %xmm0 # sched: [1:0.50]
 ; GOLDMONT-NEXT:    retq # sched: [4:1.00]
 ;
 ; CANNONLAKE-LABEL: test_sha256rnds2:
@@ -228,11 +228,11 @@ define <4 x i32> @test_sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2,
 ;
 ; ZNVER1-LABEL: test_sha256rnds2:
 ; ZNVER1:       # %bb.0:
-; ZNVER1-NEXT:    vmovaps %xmm0, %xmm3 # sched: [1:0.50]
-; ZNVER1-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT:    vmovaps %xmm0, %xmm3 # sched: [1:0.25]
+; ZNVER1-NEXT:    vmovaps %xmm2, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00]
 ; ZNVER1-NEXT:    sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [11:1.00]
-; ZNVER1-NEXT:    vmovaps %xmm3, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT:    vmovaps %xmm3, %xmm0 # sched: [1:0.25]
 ; ZNVER1-NEXT:    retq # sched: [1:0.50]
   %1 = load <4 x i32>, <4 x i32>* %a3
   %2 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
diff --git a/llvm/test/CodeGen/X86/sse-schedule.ll b/llvm/test/CodeGen/X86/sse-schedule.ll
index 8ebea98274d..3a123df699f 100644
--- a/llvm/test/CodeGen/X86/sse-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse-schedule.ll
@@ -2557,7 +2557,7 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
 ; SLM-NEXT:    rcpps (%rdi), %xmm1 # sched: [8:1.00]
 ; SLM-NEXT:    rcpps %xmm0, %xmm0 # sched: [5:1.00]
 ; SLM-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_rcpps:
@@ -2729,7 +2729,7 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; SLM-NEXT:    rsqrtps (%rdi), %xmm1 # sched: [8:1.00]
 ; SLM-NEXT:    rsqrtps %xmm0, %xmm0 # sched: [5:1.00]
 ; SLM-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_rsqrtps:
@@ -3038,7 +3038,7 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
 ; SLM-NEXT:    sqrtps (%rdi), %xmm1 # sched: [18:1.00]
 ; SLM-NEXT:    sqrtps %xmm0, %xmm0 # sched: [15:1.00]
 ; SLM-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_sqrtps:
diff --git a/llvm/test/CodeGen/X86/sse2-schedule.ll b/llvm/test/CodeGen/X86/sse2-schedule.ll
index a7c1c75226e..cb554e7c931 100644
--- a/llvm/test/CodeGen/X86/sse2-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse2-schedule.ll
@@ -3564,7 +3564,7 @@ define <2 x double> @test_movsd_reg(<2 x double> %a0, <2 x double> %a1) {
 ; SLM-LABEL: test_movsd_reg:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_movsd_reg:
@@ -8756,7 +8756,7 @@ define <2 x double> @test_sqrtpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SLM-NEXT:    sqrtpd (%rdi), %xmm1 # sched: [18:1.00]
 ; SLM-NEXT:    sqrtpd %xmm0, %xmm0 # sched: [15:1.00]
 ; SLM-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_sqrtpd:
@@ -9284,10 +9284,10 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ; SLM-LABEL: test_unpcklpd:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SLM-NEXT:    movapd %xmm0, %xmm1 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm0, %xmm1 # sched: [1:0.50]
 ; SLM-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
 ; SLM-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_unpcklpd:
diff --git a/llvm/test/CodeGen/X86/sse3-schedule.ll b/llvm/test/CodeGen/X86/sse3-schedule.ll
index bb7694c1e4a..1b1e765ab27 100644
--- a/llvm/test/CodeGen/X86/sse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse3-schedule.ll
@@ -566,7 +566,7 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
 ; SLM-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0] sched: [4:1.00]
 ; SLM-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
 ; SLM-NEXT:    subpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_movddup:
@@ -645,7 +645,7 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
 ; SLM-NEXT:    movshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:1.00]
 ; SLM-NEXT:    movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
 ; SLM-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_movshdup:
@@ -724,7 +724,7 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
 ; SLM-NEXT:    movsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:1.00]
 ; SLM-NEXT:    movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
 ; SLM-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_movsldup:
diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll
index 60fc0e8e514..17188884983 100644
--- a/llvm/test/CodeGen/X86/sse41-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse41-schedule.ll
@@ -163,11 +163,11 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
 ;
 ; SLM-LABEL: test_blendvpd:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    movapd %xmm0, %xmm3 # sched: [1:1.00]
-; SLM-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm0, %xmm3 # sched: [1:0.50]
+; SLM-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
 ; SLM-NEXT:    blendvpd %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
-; SLM-NEXT:    movapd %xmm3, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm3, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_blendvpd:
@@ -230,11 +230,11 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ;
 ; SLM-LABEL: test_blendvps:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    movaps %xmm0, %xmm3 # sched: [1:1.00]
-; SLM-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm0, %xmm3 # sched: [1:0.50]
+; SLM-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
 ; SLM-NEXT:    blendvps %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
-; SLM-NEXT:    movaps %xmm3, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm3, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_blendvps:
@@ -717,7 +717,7 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
 ; SLM-LABEL: test_pblendvb:
 ; SLM:       # %bb.0:
 ; SLM-NEXT:    movdqa %xmm0, %xmm3 # sched: [1:0.50]
-; SLM-NEXT:    movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm2, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
 ; SLM-NEXT:    pblendvb %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
 ; SLM-NEXT:    movdqa %xmm3, %xmm0 # sched: [1:0.50]
@@ -2991,7 +2991,7 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
 ; SLM-NEXT:    roundpd $7, (%rdi), %xmm1 # sched: [6:1.00]
 ; SLM-NEXT:    roundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
 ; SLM-NEXT:    addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_roundpd:
@@ -3063,7 +3063,7 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
 ; SLM-NEXT:    roundps $7, (%rdi), %xmm1 # sched: [6:1.00]
 ; SLM-NEXT:    roundps $7, %xmm0, %xmm0 # sched: [3:1.00]
 ; SLM-NEXT:    addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm1, %xmm0 # sched: [1:0.50]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-LABEL: test_roundps:
@@ -3133,7 +3133,7 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ;
 ; SLM-LABEL: test_roundsd:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    movapd %xmm0, %xmm2 # sched: [1:1.00]
+; SLM-NEXT:    movapd %xmm0, %xmm2 # sched: [1:0.50]
 ; SLM-NEXT:    roundsd $7, (%rdi), %xmm0 # sched: [6:1.00]
 ; SLM-NEXT:    roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
 ; SLM-NEXT:    addpd %xmm2, %xmm0 # sched: [3:1.00]
@@ -3206,7 +3206,7 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
 ;
 ; SLM-LABEL: test_roundss:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    movaps %xmm0, %xmm2 # sched: [1:1.00]
+; SLM-NEXT:    movaps %xmm0, %xmm2 # sched: [1:0.50]
 ; SLM-NEXT:    roundss $7, (%rdi), %xmm0 # sched: [6:1.00]
 ; SLM-NEXT:    roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
 ; SLM-NEXT:    addps %xmm2, %xmm0 # sched: [3:1.00]
```
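
The test churn above is just the `sched:` comments picking up the new resource bindings; the format is `[latency:reciprocal throughput]`. The Znver1 register moves improving from `[1:0.50]` to `[1:0.25]`, for instance, follow from `WriteFMove`/`WriteVecMove` now issuing on the whole `ZnFPU` group. The def below is from the Znver1 hunk above; the arithmetic in the comments is one reading of it, assuming `ZnFPU` groups the model's four FP pipes:

```tablegen
// A WriteRes with no explicit Latency defaults to 1 cycle, and the printed
// reciprocal throughput is roughly 1 / (units in the resource group), so a
// four-pipe group yields one move per 0.25 cycles: sched comment "[1:0.25]".
def : WriteRes<WriteFMove, [ZnFPU]>;  // ZnFPU ~ {ZnFPU0..ZnFPU3}
```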