diff options
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-05-10 17:06:09 +0000 |
|---|---|---|
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-05-10 17:06:09 +0000 |
| commit | 38ac0e9c6b1ed09bae3ebb11414fa9908a6b10a3 (patch) | |
| tree | f82bc003c0ebb2c067cb2b7e9832bc88911b4748 /llvm | |
| parent | dc4bb3fd3607609cba3598092acbc86f9558fd75 (diff) | |
| download | bcm5719-llvm-38ac0e9c6b1ed09bae3ebb11414fa9908a6b10a3.tar.gz bcm5719-llvm-38ac0e9c6b1ed09bae3ebb11414fa9908a6b10a3.zip | |
[X86] Split WriteVecALU/WriteVecLogic/WriteShuffle/WriteVarShuffle/WritePSADBW/WritePHAdd scheduler classes
Split off XMM classes from the default (MMX) classes.
llvm-svn: 331999
Diffstat (limited to 'llvm')
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrAVX512.td | 10 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86InstrXOP.td | 4 | ||||
| -rwxr-xr-x | llvm/lib/Target/X86/X86SchedBroadwell.td | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedHaswell.td | 80 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSandyBridge.td | 119 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSkylakeClient.td | 90 | ||||
| -rwxr-xr-x | llvm/lib/Target/X86/X86SchedSkylakeServer.td | 103 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86Schedule.td | 26 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleAtom.td | 20 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleBtVer2.td | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleSLM.td | 6 | ||||
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleZnver1.td | 7 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/3dnow-schedule.ll | 4 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/avx2-schedule.ll | 24 | ||||
| -rwxr-xr-x | llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll | 96 | ||||
| -rw-r--r-- | llvm/test/CodeGen/X86/xop-schedule.ll | 6 |
16 files changed, 201 insertions, 406 deletions
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td index 958ec92e38c..cf2e33c0303 100644 --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -1333,7 +1333,7 @@ multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, avx512_int_broadcast_rm_lowering<_.info256, _.info256>, EVEX_V256; defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle, - WriteShuffleLd, _.info128, _.info128>, + WriteShuffleXLd, _.info128, _.info128>, EVEX_V128; } } @@ -1353,7 +1353,8 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", (_Dst.VT (X86SubVBroadcast (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, - AVX5128IBase, EVEX, Sched<[WriteShuffleLd]>; + Sched<[SchedWriteShuffle.YMM.Folded]>, + AVX5128IBase, EVEX; } // This should be used for the AVX512DQ broadcast instructions. It disables @@ -1367,7 +1368,8 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, (null_frag), (_Dst.VT (X86SubVBroadcast (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, - AVX5128IBase, EVEX, Sched<[WriteShuffleLd]>; + Sched<[SchedWriteShuffle.YMM.Folded]>, + AVX5128IBase, EVEX; } let Predicates = [HasAVX512] in { @@ -1646,7 +1648,7 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, let Predicates = [HasDQI, HasVLX] in defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle, - WriteShuffleLd, _Dst.info128, + WriteShuffleXLd, _Dst.info128, _Src.info128, _Src.info128, null_frag>, EVEX_V128; } diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td index a2f9b142603..47b2f5d5e9b 100644 --- a/llvm/lib/Target/X86/X86InstrXOP.td +++ b/llvm/lib/Target/X86/X86InstrXOP.td @@ -14,11 +14,11 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WritePHAdd]>; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[SchedWritePHAdd.XMM]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, - Sched<[WritePHAdd.Folded, ReadAfterLd]>; + Sched<[SchedWritePHAdd.XMM.Folded, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 764decf2d8a..fcda18860e2 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -265,8 +265,10 @@ defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>; defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. +defm : BWWriteResPair<WriteVecALUX, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. defm : BWWriteResPair<WriteVecALUY, [BWPort15], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (YMM/ZMM). defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : BWWriteResPair<WriteVecLogicX,[BWPort015], 1, [1], 1, 5>; // Vector integer and/or/xor. defm : BWWriteResPair<WriteVecLogicY,[BWPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (YMM/ZMM). defm : BWWriteResPair<WriteVecTest, [BWPort0,BWPort5], 2, [1,1], 2, 5>; // Vector integer TEST instructions. defm : BWWriteResPair<WriteVecTestY, [BWPort0,BWPort5], 4, [1,1], 2, 6>; // Vector integer TEST instructions (YMM/ZMM). @@ -276,8 +278,10 @@ defm : BWWriteResPair<WriteVecIMulY, [BWPort0], 5, [1], 1, 6>; // Vector intege defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // Vector PMULLD. defm : BWWriteResPair<WritePMULLDY, [BWPort0], 10, [2], 2, 6>; // Vector PMULLD (YMM/ZMM). defm : BWWriteResPair<WriteShuffle, [BWPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : BWWriteResPair<WriteShuffleX, [BWPort5], 1, [1], 1, 5>; // Vector shuffles. defm : BWWriteResPair<WriteShuffleY, [BWPort5], 1, [1], 1, 6>; // Vector shuffles (YMM/ZMM). defm : BWWriteResPair<WriteVarShuffle, [BWPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : BWWriteResPair<WriteVarShuffleX,[BWPort5], 1, [1], 1, 5>; // Vector variable shuffles. defm : BWWriteResPair<WriteVarShuffleY,[BWPort5], 1, [1], 1, 6>; // Vector variable shuffles (YMM/ZMM). defm : BWWriteResPair<WriteBlend, [BWPort5], 1, [1], 1, 5>; // Vector blends. defm : BWWriteResPair<WriteBlendY, [BWPort5], 1, [1], 1, 6>; // Vector blends (YMM/ZMM). @@ -286,6 +290,7 @@ defm : BWWriteResPair<WriteVarBlendY, [BWPort5], 2, [2], 2, 6>; // Vector variab defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 7, [1, 2], 3, 5>; // Vector MPSAD. defm : BWWriteResPair<WriteMPSADY, [BWPort0, BWPort5], 7, [1, 2], 3, 6>; // Vector MPSAD. defm : BWWriteResPair<WritePSADBW, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW. +defm : BWWriteResPair<WritePSADBWX, [BWPort0], 5, [1], 1, 5>; // Vector PSADBW. defm : BWWriteResPair<WritePSADBWY, [BWPort0], 5, [1], 1, 6>; // Vector PSADBW (YMM/ZMM). defm : BWWriteResPair<WritePHMINPOS, [BWPort0], 5>; // Vector PHMINPOS. @@ -448,6 +453,7 @@ def : WriteRes<WriteNop, []>; defm : BWWriteResPair<WriteFHAdd, [BWPort1,BWPort5], 5, [1,2], 3, 5>; defm : BWWriteResPair<WriteFHAddY, [BWPort1,BWPort5], 5, [1,2], 3, 6>; defm : BWWriteResPair<WritePHAdd, [BWPort5,BWPort15], 3, [2,1], 3, 5>; +defm : BWWriteResPair<WritePHAddX, [BWPort5,BWPort15], 3, [2,1], 3, 5>; defm : BWWriteResPair<WritePHAddY, [BWPort5,BWPort15], 3, [2,1], 3, 6>; // Remaining instrs. diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 1e431e078cd..1dda56a7d65 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -255,11 +255,13 @@ defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5 defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>; -defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVecLogicX,[HWPort015], 1, [1], 1, 6>; defm : HWWriteResPair<WriteVecLogicY,[HWPort015], 1, [1], 1, 7>; defm : HWWriteResPair<WriteVecTest, [HWPort0,HWPort5], 2, [1,1], 2, 6>; defm : HWWriteResPair<WriteVecTestY, [HWPort0,HWPort5], 4, [1,1], 2, 7>; -defm : HWWriteResPair<WriteVecALU, [HWPort15], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVecALU, [HWPort15], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVecALUX, [HWPort15], 1, [1], 1, 6>; defm : HWWriteResPair<WriteVecALUY, [HWPort15], 1, [1], 1, 7>; defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5, [1], 1, 5>; defm : HWWriteResPair<WriteVecIMulX, [HWPort0], 5, [1], 1, 6>; @@ -267,8 +269,10 @@ defm : HWWriteResPair<WriteVecIMulY, [HWPort0], 5, [1], 1, 7>; defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>; defm : HWWriteResPair<WritePMULLDY, [HWPort0], 10, [2], 2, 7>; defm : HWWriteResPair<WriteShuffle, [HWPort5], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteShuffleX, [HWPort5], 1, [1], 1, 6>; defm : HWWriteResPair<WriteShuffleY, [HWPort5], 1, [1], 1, 7>; -defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 6>; +defm : HWWriteResPair<WriteVarShuffle, [HWPort5], 1, [1], 1, 5>; +defm : HWWriteResPair<WriteVarShuffleX,[HWPort5], 1, [1], 1, 6>; defm : HWWriteResPair<WriteVarShuffleY,[HWPort5], 1, [1], 1, 7>; defm : HWWriteResPair<WriteBlend, [HWPort5], 1, [1], 1, 6>; defm : HWWriteResPair<WriteBlendY, [HWPort5], 1, [1], 1, 7>; @@ -278,7 +282,8 @@ defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2], 2, 6>; defm : HWWriteResPair<WriteVarBlendY, [HWPort5], 2, [2], 2, 7>; defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 7, [1, 2], 3, 6>; defm : HWWriteResPair<WriteMPSADY, [HWPort0, HWPort5], 7, [1, 2], 3, 7>; -defm : HWWriteResPair<WritePSADBW, [HWPort0], 5, [1], 1, 6>; +defm : HWWriteResPair<WritePSADBW, [HWPort0], 5, [1], 1, 5>; +defm : HWWriteResPair<WritePSADBWX, [HWPort0], 5, [1], 1, 6>; defm : HWWriteResPair<WritePSADBWY, [HWPort0], 5, [1], 1, 7>; defm : HWWriteResPair<WritePHMINPOS, [HWPort0], 5, [1], 1, 6>; @@ -684,7 +689,8 @@ def : InstRW<[HWWriteFXTRACT], (instrs FXTRACT)>; defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1,2], 3, 6>; defm : HWWriteResPair<WriteFHAddY, [HWPort1, HWPort5], 5, [1,2], 3, 7>; -defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 6>; +defm : HWWriteResPair<WritePHAdd, [HWPort5, HWPort15], 3, [2,1], 3, 5>; +defm : HWWriteResPair<WritePHAddX, [HWPort5, HWPort15], 3, [2,1], 3, 6>; defm : HWWriteResPair<WritePHAddY, [HWPort5, HWPort15], 3, [2,1], 3, 7>; //=== Floating Point XMM and YMM Instructions ===// @@ -913,26 +919,16 @@ def HWWriteResGroup12_2 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156,HWPort23]> def: InstRW<[HWWriteResGroup12_2], (instrs IMUL16m, MUL16m)>; def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> { - let Latency = 7; + let Latency = 6; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup13], (instregex "(V?)PACKSSDWrm", - "(V?)PACKSSWBrm", - "(V?)PACKUSDWrm", - "(V?)PACKUSWBrm", - "(V?)PALIGNRrmi", - "(V?)PSHUFDmi", - "(V?)PSHUFHWmi", - "(V?)PSHUFLWmi", - "(V?)PUNPCKHBWrm", - "(V?)PUNPCKHDQrm", - "(V?)PUNPCKHQDQrm", - "(V?)PUNPCKHWDrm", - "(V?)PUNPCKLBWrm", - "(V?)PUNPCKLDQrm", - "(V?)PUNPCKLQDQrm", - "(V?)PUNPCKLWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "(V?)PMOV(SX|ZX)BDrm", + "(V?)PMOV(SX|ZX)BQrm", + "(V?)PMOV(SX|ZX)BWrm", + "(V?)PMOV(SX|ZX)DQrm", + "(V?)PMOV(SX|ZX)WDrm", + "(V?)PMOV(SX|ZX)WQrm")>; def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> { let Latency = 8; @@ -943,13 +939,6 @@ def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm", "VPMOVSXBQYrm", "VPMOVSXWQYrm")>; -def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm")>; - def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> { let Latency = 6; let NumMicroOps = 2; @@ -974,14 +963,7 @@ def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm", "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "MOVBE(16|32|64)rm", - "MMX_PABS(B|D|W)rm", - "MMX_P(ADD|SUB)(B|D|W|Q)irm", - "MMX_P(ADD|SUB)(U?)S(B|W)irm", - "MMX_PAVG(B|W)irm", - "MMX_PCMP(EQ|GT)(B|D|W)irm", - "MMX_P(MAX|MIN)(SW|UB)irm", - "MMX_PSIGN(B|D|W)rm")>; + "MOVBE(16|32|64)rm")>; def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> { let Latency = 7; @@ -992,16 +974,6 @@ def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm", "VINSERTI128rm", "VPBLENDDrmi")>; -def HWWriteResGroup17_1 : SchedWriteRes<[HWPort23,HWPort015]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDNirm", - "MMX_PANDirm", - "MMX_PORirm", - "MMX_PXORirm")>; - def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> { let Latency = 8; let NumMicroOps = 2; @@ -1356,13 +1328,6 @@ def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> { def: InstRW<[HWWriteResGroup62], (instregex "IST(T?)_FP(16|32|64)m", "IST_F(16|32)m")>; -def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[HWWriteResGroup64], (instregex "MMX_PH(ADD|SUB)(D|SW|W)rm")>; - def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { let Latency = 8; let NumMicroOps = 4; @@ -1594,13 +1559,6 @@ def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> { def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F(32|64)m", "VPCMPGTQYrm")>; -def HWWriteResGroup91_5 : SchedWriteRes<[HWPort0,HWPort23]> { - let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[HWWriteResGroup91_5], (instregex "MMX_PSADBWirm")>; - def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> { let Latency = 5; let NumMicroOps = 3; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 295c724b639..821f6575630 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -235,11 +235,13 @@ defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1] defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>; -defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteVecLogicX,[SBPort015], 1, [1], 1, 6>; defm : SBWriteResPair<WriteVecLogicY,[SBPort015], 1, [1], 1, 7>; defm : SBWriteResPair<WriteVecTest, [SBPort0,SBPort5], 2, [1,1], 2, 6>; defm : SBWriteResPair<WriteVecTestY, [SBPort0,SBPort5], 2, [1,1], 2, 7>; -defm : SBWriteResPair<WriteVecALU, [SBPort15], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVecALU, [SBPort1], 3, [1], 1, 5>; +defm : SBWriteResPair<WriteVecALUX, [SBPort15], 1, [1], 1, 6>; defm : SBWriteResPair<WriteVecALUY, [SBPort15], 1, [1], 1, 7>; defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5, [1], 1, 5>; defm : SBWriteResPair<WriteVecIMulX, [SBPort0], 5, [1], 1, 6>; @@ -247,8 +249,10 @@ defm : SBWriteResPair<WriteVecIMulY, [SBPort0], 5, [1], 1, 7>; defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>; defm : SBWriteResPair<WritePMULLDY, [SBPort0], 5, [1], 1, 7>; // TODO this is probably wrong for 256/512-bit for the "generic" model defm : SBWriteResPair<WriteShuffle, [SBPort5], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteShuffleX, [SBPort15], 1, [1], 1, 6>; defm : SBWriteResPair<WriteShuffleY, [SBPort5], 1, [1], 1, 7>; -defm : SBWriteResPair<WriteVarShuffle, [SBPort15], 1, [1], 1, 6>; +defm : SBWriteResPair<WriteVarShuffle, [SBPort15], 1, [1], 1, 5>; +defm : SBWriteResPair<WriteVarShuffleX, [SBPort15], 1, [1], 1, 6>; defm : SBWriteResPair<WriteVarShuffleY, [SBPort15], 1, [1], 1, 7>; defm : SBWriteResPair<WriteBlend, [SBPort15], 1, [1], 1, 6>; defm : SBWriteResPair<WriteBlendY, [SBPort15], 1, [1], 1, 7>; @@ -256,7 +260,8 @@ defm : SBWriteResPair<WriteVarBlend, [SBPort15], 2, [2], 2, 6>; defm : SBWriteResPair<WriteVarBlendY,[SBPort15], 2, [2], 2, 7>; defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 7, [1,2], 3, 6>; defm : SBWriteResPair<WriteMPSADY, [SBPort0, SBPort15], 7, [1,2], 3, 7>; -defm : SBWriteResPair<WritePSADBW, [SBPort0], 5, [1], 1, 6>; +defm : SBWriteResPair<WritePSADBW, [SBPort0], 5, [1], 1, 5>; +defm : SBWriteResPair<WritePSADBWX, [SBPort0], 5, [1], 1, 6>; defm : SBWriteResPair<WritePSADBWY, [SBPort0], 5, [1], 1, 7>; defm : SBWriteResPair<WritePHMINPOS, [SBPort0], 5, [1], 1, 6>; @@ -295,7 +300,8 @@ def : WriteRes<WriteVecExtractSt, [SBPort4,SBPort23,SBPort15]> { defm : SBWriteResPair<WriteFHAdd, [SBPort1,SBPort5], 5, [1,2], 3, 6>; defm : SBWriteResPair<WriteFHAddY, [SBPort1,SBPort5], 5, [1,2], 3, 7>; -defm : SBWriteResPair<WritePHAdd, [SBPort15], 3, [3], 3, 6>; +defm : SBWriteResPair<WritePHAdd, [SBPort15], 3, [3], 3, 5>; +defm : SBWriteResPair<WritePHAddX, [SBPort15], 3, [3], 3, 6>; defm : SBWriteResPair<WritePHAddY, [SBPort15], 3, [3], 3, 7>; //////////////////////////////////////////////////////////////////////////////// @@ -471,37 +477,10 @@ def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNRrri", - "(V?)PACKSSDWrr", - "(V?)PACKSSWBrr", - "(V?)PACKUSDWrr", - "(V?)PACKUSWBrr", - "(V?)PALIGNRrri", - "(V?)PMOVSXBDrr", - "(V?)PMOVSXBQrr", - "(V?)PMOVSXBWrr", - "(V?)PMOVSXDQrr", - "(V?)PMOVSXWDrr", - "(V?)PMOVSXWQrr", - "(V?)PMOVZXBDrr", - "(V?)PMOVZXBQrr", - "(V?)PMOVZXBWrr", - "(V?)PMOVZXDQrr", - "(V?)PMOVZXWDrr", - "(V?)PMOVZXWQrr", - "(V?)PSHUFDri", - "(V?)PSHUFHWri", - "(V?)PSHUFLWri", - "(V?)PSLLDQri", - "(V?)PSRLDQri", - "(V?)PUNPCKHBWrr", - "(V?)PUNPCKHDQrr", - "(V?)PUNPCKHQDQrr", - "(V?)PUNPCKHWDrr", - "(V?)PUNPCKLBWrr", - "(V?)PUNPCKLDQrr", - "(V?)PUNPCKLQDQrr", - "(V?)PUNPCKLWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABS(B|D|W)rr", + "MMX_PADDQirr", + "MMX_PALIGNRrri", + "MMX_PSIGN(B|D|W)rr")>; def SBWriteResGroup6 : SchedWriteRes<[SBPort015]> { let Latency = 1; @@ -608,12 +587,6 @@ def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> { let ResourceCycles = [1]; } def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr", - "MMX_PADD(B|D|W)irr", - "MMX_P(ADD|SUB)(U?)S(B|W)irr", - "MMX_PAVG(B|W)irr", - "MMX_PCMP(EQ|GT)(B|D|W)irr", - "MMX_P(MAX|MIN)(SW|UB)irr", - "MMX_PSUB(B|D|Q|W)irr", "PUSHFS64", "(V?)CVTDQ2PS(Y?)rr")>; @@ -884,7 +857,6 @@ def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> { } def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABS(B|D|W)rm", "MMX_PALIGNRrmi", - "MMX_PSHUFBrm", "MMX_PSIGN(B|D|W)rm")>; def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> { @@ -893,11 +865,7 @@ def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> { let ResourceCycles = [1,1]; } def: InstRW<[SBWriteResGroup52], (instregex "LODSL", - "LODSQ", - "MMX_PANDirm", - "MMX_PANDNirm", - "MMX_PORirm", - "MMX_PXORirm")>; + "LODSQ")>; def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> { let Latency = 6; @@ -944,46 +912,7 @@ def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SBWriteResGroup59], (instregex "(V?)PACKSSDWrm", - "(V?)PACKSSWBrm", - "(V?)PACKUSDWrm", - "(V?)PACKUSWBrm", - "(V?)PALIGNRrmi", - "(V?)PMOVSXBDrm", - "(V?)PMOVSXBQrm", - "(V?)PMOVSXBWrm", - "(V?)PMOVSXDQrm", - "(V?)PMOVSXWDrm", - "(V?)PMOVSXWQrm", - "(V?)PMOVZXBDrm", - "(V?)PMOVZXBQrm", - "(V?)PMOVZXBWrm", - "(V?)PMOVZXDQrm", - "(V?)PMOVZXWDrm", - "(V?)PMOVZXWQrm", - "(V?)PSHUFDmi", - "(V?)PSHUFHWmi", - "(V?)PSHUFLWmi", - "(V?)PUNPCKHBWrm", - "(V?)PUNPCKHDQrm", - "(V?)PUNPCKHQDQrm", - "(V?)PUNPCKHWDrm", - "(V?)PUNPCKLBWrm", - "(V?)PUNPCKLDQrm", - "(V?)PUNPCKLQDQrm", - "(V?)PUNPCKLWDrm")>; - -def SBWriteResGroup59a : SchedWriteRes<[SBPort23,SBPort1]> { - let Latency = 8; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup59a], (instregex "MMX_PADD(B|D|W)irm", - "MMX_P(ADD|SUB)(U?)S(B|W)irm", - "MMX_PAVG(B|W)irm", - "MMX_PCMP(EQ|GT)(B|D|W)irm", - "MMX_P(MAX|MIN)(SW|UB)irm", - "MMX_PSUB(B|D|Q|W)irm")>; +def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>; def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> { let Latency = 7; @@ -1060,13 +989,6 @@ def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { } def: InstRW<[SBWriteResGroup77], (instregex "(V?)(U?)COMI(SD|SS)rm")>; -def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [1,3]; -} -def: InstRW<[SBWriteResGroup80], (instregex "MMX_PH(ADD|SUB)(D|SW|W)rm")>; - def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> { let Latency = 8; let NumMicroOps = 4; @@ -1134,13 +1056,6 @@ def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8", "SHRD(16|32|64)mri8")>; -def SBWriteResGroup89_2 : SchedWriteRes<[SBPort0,SBPort23]> { - let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SBWriteResGroup89_2], (instregex "MMX_PSADBWirm")>; - def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> { let Latency = 9; let NumMicroOps = 2; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index d521864f6db..9c5f7de19ef 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -256,9 +256,11 @@ defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKLPort015], 1, [1], 1>; -defm : SKLWriteResPair<WriteVecALU, [SKLPort01], 1, [1], 1, 6>; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair<WriteVecALU, [SKLPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair<WriteVecALUX, [SKLPort01], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (XMM). defm : SKLWriteResPair<WriteVecALUY, [SKLPort01], 1, [1], 1, 7>; // Vector integer ALU op, no logicals (YMM/ZMM). -defm : SKLWriteResPair<WriteVecLogic, [SKLPort015], 1, [1], 1, 6>; // Vector integer and/or/xor. +defm : SKLWriteResPair<WriteVecLogic, [SKLPort05], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : SKLWriteResPair<WriteVecLogicX,[SKLPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (XMM). defm : SKLWriteResPair<WriteVecLogicY,[SKLPort015], 1, [1], 1, 7>; // Vector integer and/or/xor (YMM/ZMM). defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions. defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>; // Vector integer TEST instructions (YMM/ZMM). @@ -268,17 +270,20 @@ defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>; // Vector int defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD. defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>; // Vector PMULLD (YMM/ZMM). defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : SKLWriteResPair<WriteShuffleX, [SKLPort5], 1, [1], 1, 6>; // Vector shuffles (XMM). defm : SKLWriteResPair<WriteShuffleY, [SKLPort5], 1, [1], 1, 7>; // Vector shuffles (YMM/ZMM). -defm : SKLWriteResPair<WriteVarShuffle, [SKLPort5], 1, [1], 1, 6>; // Vector shuffles. +defm : SKLWriteResPair<WriteVarShuffle, [SKLPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : SKLWriteResPair<WriteVarShuffleX, [SKLPort5], 1, [1], 1, 6>; // Vector shuffles (XMM). defm : SKLWriteResPair<WriteVarShuffleY, [SKLPort5], 1, [1], 1, 7>; // Vector shuffles (YMM/ZMM). defm : SKLWriteResPair<WriteBlend, [SKLPort5], 1, [1], 1, 6>; // Vector blends. defm : SKLWriteResPair<WriteBlendY, [SKLPort5], 1, [1], 1, 7>; // Vector blends (YMM/ZMM). defm : SKLWriteResPair<WriteVarBlend, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends. defm : SKLWriteResPair<WriteVarBlendY, [SKLPort015], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM). defm : SKLWriteResPair<WriteMPSAD, [SKLPort5], 4, [2], 2, 6>; // Vector MPSAD. -defm : SKLWriteResPair<WriteMPSADY, [SKLPort5], 4, [2], 2, 7>; // Vector MPSAD. -defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 6>; // Vector PSADBW. -defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>; // Vector PSADBW. +defm : SKLWriteResPair<WriteMPSADY, [SKLPort5], 4, [2], 2, 7>; // Vector MPSAD (YMM/ZMM). +defm : SKLWriteResPair<WritePSADBW, [SKLPort5], 3, [1], 1, 5>; // Vector PSADBW. +defm : SKLWriteResPair<WritePSADBWX, [SKLPort5], 3, [1], 1, 6>; // Vector PSADBW (XMM). +defm : SKLWriteResPair<WritePSADBWY, [SKLPort5], 3, [1], 1, 7>; // Vector PSADBW (YMM/ZMM). defm : SKLWriteResPair<WritePHMINPOS, [SKLPort01], 4, [1], 1, 6>; // Vector PHMINPOS. // Vector integer shifts. @@ -450,7 +455,8 @@ def : WriteRes<WriteNop, []>; defm : SKLWriteResPair<WriteFHAdd, [SKLPort5,SKLPort01], 6, [2,1], 3, 6>; defm : SKLWriteResPair<WriteFHAddY, [SKLPort5,SKLPort01], 6, [2,1], 3, 7>; -defm : SKLWriteResPair<WritePHAdd, [SKLPort5,SKLPort015], 3, [2,1], 3, 6>; +defm : SKLWriteResPair<WritePHAdd, [SKLPort5,SKLPort05], 3, [2,1], 3, 5>; +defm : SKLWriteResPair<WritePHAddX, [SKLPort5,SKLPort015], 3, [2,1], 3, 6>; defm : SKLWriteResPair<WritePHAddY, [SKLPort5,SKLPort015], 3, [2,1], 3, 7>; // Remaining instrs. @@ -497,15 +503,7 @@ def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> { let ResourceCycles = [1]; } def: InstRW<[SKLWriteResGroup6], (instrs FINCSTP, FNOP)>; -def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr", - "MMX_PABS(B|D|W)rr", - "MMX_PADD(B|D|Q|W)irr", - "MMX_PANDNirr", - "MMX_PANDirr", - "MMX_PORirr", - "MMX_PSIGN(B|D|W)rr", - "MMX_PSUB(B|D|Q|W)irr", - "MMX_PXORirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr")>; def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> { let Latency = 1; @@ -768,13 +766,6 @@ def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> { def: InstRW<[SKLWriteResGroup36], (instregex "(V?)PHADDSW(Y?)rr", "(V?)PHSUBSW(Y?)rr")>; -def SKLWriteResGroup37 : SchedWriteRes<[SKLPort5,SKLPort05]> { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [2,1]; -} -def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PH(ADD|SUB)(D|W)rr")>; - def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -1037,20 +1028,6 @@ def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> { def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64", "JMP(16|32|64)m")>; -def SKLWriteResGroup73 : SchedWriteRes<[SKLPort23,SKLPort05]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABS(B|D|W)rm", - "MMX_PADD(B|D|Q|W)irm", - "MMX_PANDNirm", - "MMX_PANDirm", - "MMX_PORirm", - "MMX_PSIGN(B|D|W)rm", - "MMX_PSUB(B|D|Q|W)irm", - "MMX_PXORirm")>; - def SKLWriteResGroup74 : SchedWriteRes<[SKLPort23,SKLPort06]> { let Latency = 6; let NumMicroOps = 2; @@ -1165,35 +1142,16 @@ def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> { def: InstRW<[SKLWriteResGroup86], (instregex "VCVTDQ2PDYrr")>; def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> { - let Latency = 7; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PACKSSDWrm", - "(V?)PACKSSWBrm", - "(V?)PACKUSDWrm", - "(V?)PACKUSWBrm", - "(V?)PALIGNRrmi", - "VPBROADCASTBrm", - "VPBROADCASTWrm", - "(V?)PSHUFDmi", - "(V?)PSHUFHWmi", - "(V?)PSHUFLWmi", - "(V?)PUNPCKHBWrm", - "(V?)PUNPCKHDQrm", - "(V?)PUNPCKHQDQrm", - "(V?)PUNPCKHWDrm", - "(V?)PUNPCKLBWrm", - "(V?)PUNPCKLDQrm", - "(V?)PUNPCKLQDQrm", - "(V?)PUNPCKLWDrm")>; - -def SKLWriteResGroup88a : SchedWriteRes<[SKLPort5,SKLPort23]> { let Latency = 6; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup88a], (instregex "MMX_PSHUFBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "(V?)PMOV(SX|ZX)BDrm", + "(V?)PMOV(SX|ZX)BQrm", + "(V?)PMOV(SX|ZX)BWrm", + "(V?)PMOV(SX|ZX)DQrm", + "(V?)PMOV(SX|ZX)WDrm", + "(V?)PMOV(SX|ZX)WQrm")>; def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort01]> { let Latency = 7; @@ -1326,7 +1284,6 @@ def: InstRW<[SKLWriteResGroup108], (instregex "FCOM32m", "FCOM64m", "FCOMP32m", "FCOMP64m", - "MMX_PSADBWirm", // TODO - SKLWriteResGroup120?? "VPBROADCASTBYrm", "VPBROADCASTWYrm", "VPMOVSXBDYrm", @@ -1349,13 +1306,6 @@ def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { } def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PH(ADD|SUB)SWrm")>; -def SKLWriteResGroup113 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort05]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PH(ADD|SUB)(D|W)rm")>; - def SKLWriteResGroup114 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort01]> { let Latency = 8; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index 3f0b51d140d..59c773ec769 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -256,9 +256,11 @@ defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKXPort015], 1, [1], 1>; -defm : SKXWriteResPair<WriteVecALU, [SKXPort01], 1, [1], 1, 6>; // Vector integer ALU op, no logicals. +defm : SKXWriteResPair<WriteVecALU, [SKXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. +defm : SKXWriteResPair<WriteVecALUX, [SKXPort01], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (XMM). defm : SKXWriteResPair<WriteVecALUY, [SKXPort01], 1, [1], 1, 7>; // Vector integer ALU op, no logicals (YMM/ZMM). -defm : SKXWriteResPair<WriteVecLogic, [SKXPort015], 1, [1], 1, 6>; // Vector integer and/or/xor. +defm : SKXWriteResPair<WriteVecLogic, [SKXPort05], 1, [1], 1, 5>; // Vector integer and/or/xor. +defm : SKXWriteResPair<WriteVecLogicX,[SKXPort015], 1, [1], 1, 6>; // Vector integer and/or/xor (XMM). defm : SKXWriteResPair<WriteVecLogicY,[SKXPort015], 1, [1], 1, 7>; // Vector integer and/or/xor (YMM/ZMM). defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions. defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>; // Vector integer TEST instructions (YMM/ZMM). @@ -268,8 +270,10 @@ defm : SKXWriteResPair<WriteVecIMulY, [SKXPort015], 4, [1], 1, 7>; // Vector in defm : SKXWriteResPair<WritePMULLD, [SKXPort015], 10, [2], 2, 6>; // Vector PMULLD. defm : SKXWriteResPair<WritePMULLDY, [SKXPort015], 10, [2], 2, 7>; // Vector PMULLD (YMM/ZMM). defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector shuffles. +defm : SKXWriteResPair<WriteShuffleX, [SKXPort5], 1, [1], 1, 6>; // Vector shuffles (XMM). defm : SKXWriteResPair<WriteShuffleY, [SKXPort5], 1, [1], 1, 7>; // Vector shuffles (YMM/ZMM). -defm : SKXWriteResPair<WriteVarShuffle, [SKXPort5], 1, [1], 1, 6>; // Vector variable shuffles. +defm : SKXWriteResPair<WriteVarShuffle, [SKXPort5], 1, [1], 1, 5>; // Vector variable shuffles. +defm : SKXWriteResPair<WriteVarShuffleX, [SKXPort5], 1, [1], 1, 6>; // Vector variable shuffles (XMM). defm : SKXWriteResPair<WriteVarShuffleY, [SKXPort5], 1, [1], 1, 7>; // Vector variable shuffles (YMM/ZMM). defm : SKXWriteResPair<WriteBlend, [SKXPort5], 1, [1], 1, 6>; // Vector blends. defm : SKXWriteResPair<WriteBlendY,[SKXPort5], 1, [1], 1, 7>; // Vector blends (YMM/ZMM). @@ -277,7 +281,8 @@ defm : SKXWriteResPair<WriteVarBlend, [SKXPort015], 2, [2], 2, 6>; // Vector var defm : SKXWriteResPair<WriteVarBlendY,[SKXPort015], 2, [2], 2, 6>; // Vector variable blends (YMM/ZMM). defm : SKXWriteResPair<WriteMPSAD, [SKXPort5], 4, [2], 2, 6>; // Vector MPSAD. defm : SKXWriteResPair<WriteMPSADY, [SKXPort5], 4, [2], 2, 7>; // Vector MPSAD. -defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 6>; // Vector PSADBW. +defm : SKXWriteResPair<WritePSADBW, [SKXPort5], 3, [1], 1, 5>; // Vector PSADBW. +defm : SKXWriteResPair<WritePSADBWX, [SKXPort5], 3, [1], 1, 6>; // Vector PSADBW. defm : SKXWriteResPair<WritePSADBWY, [SKXPort5], 3, [1], 1, 7>; // Vector PSADBW. defm : SKXWriteResPair<WritePHMINPOS, [SKXPort015], 4, [1], 1, 6>; // Vector PHMINPOS. @@ -450,7 +455,8 @@ def : WriteRes<WriteNop, []>; defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>; defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>; -defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>; +defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>; +defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>; defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>; // Remaining instrs. @@ -511,15 +517,7 @@ def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> { let ResourceCycles = [1]; } def: InstRW<[SKXWriteResGroup6], (instrs FINCSTP, FNOP)>; -def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr", - "MMX_PABS(B|D|W)rr", - "MMX_PADD(B|D|Q|W)irr", - "MMX_PANDNirr", - "MMX_PANDirr", - "MMX_PORirr", - "MMX_PSIGN(B|D|W)rr", - "MMX_PSUB(B|D|Q|W)irr", - "MMX_PXORirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr")>; def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> { let Latency = 1; @@ -601,7 +599,6 @@ def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm", "MMX_MOVD64mr", "MMX_MOVNTQmr", "MMX_MOVQ64mr", - "MOVNTDQmr", "MOVNTI_64mr", "MOVNTImr", "ST_FP32m", @@ -847,13 +844,6 @@ def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> { } def: InstRW<[SKXWriteResGroup38], (instregex "(V?)PH(ADD|SUB)SW(Y?)rr")>; -def SKXWriteResGroup39 : SchedWriteRes<[SKXPort5,SKXPort05]> { - let Latency = 3; - let NumMicroOps = 3; - let ResourceCycles = [2,1]; -} -def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PH(ADD|SUB)(D|W)rr")>; - def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> { let Latency = 3; let NumMicroOps = 3; @@ -1250,20 +1240,6 @@ def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> { def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64", "JMP(16|32|64)m")>; -def SKXWriteResGroup77 : SchedWriteRes<[SKXPort23,SKXPort05]> { - let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [1,1]; -} -def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABS(B|D|W)rm", - "MMX_PADD(B|D|Q|W)irm", - "MMX_PANDNirm", - "MMX_PANDirm", - "MMX_PORirm", - "MMX_PSIGN(B|D|W)rm", - "MMX_PSUB(B|D|Q|W)irm", - "MMX_PXORirm")>; - def SKXWriteResGroup78 : SchedWriteRes<[SKXPort23,SKXPort06]> { let Latency = 6; let NumMicroOps = 2; @@ -1394,52 +1370,19 @@ def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> { let ResourceCycles = [1,1]; } def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)", - "VMOVSSZrm(b?)", - "VPACKSSDWZ128rm(b?)", - "(V?)PACKSSDWrm", - "VPACKSSWBZ128rm(b?)", - "(V?)PACKSSWBrm", - "VPACKUSDWZ128rm(b?)", - "(V?)PACKUSDWrm", - "VPACKUSWBZ128rm(b?)", - "(V?)PACKUSWBrm", - "VPALIGNRZ128rmi(b?)", - "(V?)PALIGNRrmi", - "VPBROADCASTBZ128m(b?)", - "VPBROADCASTBrm", - "VPBROADCASTWZ128m(b?)", - "VPBROADCASTWrm", - "VPSHUFDZ128m(b?)i", - "(V?)PSHUFDmi", - "VPSHUFHWZ128mi(b?)", - "(V?)PSHUFHWmi", - "VPSHUFLWZ128mi(b?)", - "(V?)PSHUFLWmi", - "VPSLLDQZ128rm(b?)", - "VPSRLDQZ128rm(b?)", - "VPUNPCKHBWZ128rm(b?)", - "(V?)PUNPCKHBWrm", - "VPUNPCKHDQZ128rm(b?)", - "(V?)PUNPCKHDQrm", - "VPUNPCKHQDQZ128rm(b?)", - "(V?)PUNPCKHQDQrm", - "VPUNPCKHWDZ128rm(b?)", - "(V?)PUNPCKHWDrm", - "VPUNPCKLBWZ128rm(b?)", - "(V?)PUNPCKLBWrm", - "VPUNPCKLDQZ128rm(b?)", - "(V?)PUNPCKLDQrm", - "VPUNPCKLQDQZ128rm(b?)", - "(V?)PUNPCKLQDQrm", - "VPUNPCKLWDZ128rm(b?)", - "(V?)PUNPCKLWDrm")>; + "VMOVSSZrm(b?)")>; def SKXWriteResGroup92a : SchedWriteRes<[SKXPort5,SKXPort23]> { let Latency = 6; let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup92a], (instregex "MMX_PSHUFBrm")>; +def: InstRW<[SKXWriteResGroup92a], (instregex "(V?)PMOV(SX|ZX)BDrm", + "(V?)PMOV(SX|ZX)BQrm", + "(V?)PMOV(SX|ZX)BWrm", + "(V?)PMOV(SX|ZX)DQrm", + "(V?)PMOV(SX|ZX)WDrm", + "(V?)PMOV(SX|ZX)WQrm")>; def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> { let Latency = 7; @@ -1676,7 +1619,6 @@ def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> { } def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)32m", "FCOM(P?)64m", - "MMX_PSADBWirm", "VFPCLASSSDrm(b?)", "VPBROADCASTBYrm", "VPBROADCASTB(Z|Z256)m(b?)", @@ -1751,13 +1693,6 @@ def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { } def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PH(ADD|SUB)SWrm")>; -def SKXWriteResGroup124 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort05]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [2,1,1]; -} -def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PH(ADD|SUB)(D|W)rm")>; - def SKXWriteResGroup125 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> { let Latency = 8; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 3a5f324ea59..937c3495eba 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -194,6 +194,7 @@ class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } defm WriteFHAdd : X86SchedWritePair; defm WriteFHAddY : X86SchedWritePair; // YMM/ZMM. defm WritePHAdd : X86SchedWritePair; +defm WritePHAddX : X86SchedWritePair; // XMM. defm WritePHAddY : X86SchedWritePair; // YMM/ZMM. // Vector integer operations. @@ -205,10 +206,12 @@ def WriteVecMaskedStore : SchedWrite; def WriteVecMaskedStoreY : SchedWrite; def WriteVecMove : SchedWrite; -defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. -defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM/ZMM). -defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals. -defm WriteVecLogicY: X86SchedWritePair; // Vector integer and/or/xor logicals (YMM/ZMM). +defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. +defm WriteVecALUX : X86SchedWritePair; // Vector integer ALU op, no logicals (XMM). +defm WriteVecALUY : X86SchedWritePair; // Vector integer ALU op, no logicals (YMM/ZMM). +defm WriteVecLogic : X86SchedWritePair; // Vector integer and/or/xor logicals. +defm WriteVecLogicX : X86SchedWritePair; // Vector integer and/or/xor logicals (XMM). +defm WriteVecLogicY : X86SchedWritePair; // Vector integer and/or/xor logicals (YMM/ZMM). defm WriteVecTest : X86SchedWritePair; // Vector integer TEST instructions. defm WriteVecTestY : X86SchedWritePair; // Vector integer TEST instructions (YMM/ZMM). defm WriteVecShift : X86SchedWritePair; // Vector integer shifts (default). @@ -223,14 +226,17 @@ defm WriteVecIMulY : X86SchedWritePair; // Vector integer multiply (YMM/ZMM). defm WritePMULLD : X86SchedWritePair; // Vector PMULLD. defm WritePMULLDY : X86SchedWritePair; // Vector PMULLD (YMM/ZMM). defm WriteShuffle : X86SchedWritePair; // Vector shuffles. +defm WriteShuffleX : X86SchedWritePair; // Vector shuffles (XMM). defm WriteShuffleY : X86SchedWritePair; // Vector shuffles (YMM/ZMM). defm WriteVarShuffle : X86SchedWritePair; // Vector variable shuffles. +defm WriteVarShuffleX : X86SchedWritePair; // Vector variable shuffles (XMM). defm WriteVarShuffleY : X86SchedWritePair; // Vector variable shuffles (YMM/ZMM). defm WriteBlend : X86SchedWritePair; // Vector blends. defm WriteBlendY : X86SchedWritePair; // Vector blends (YMM/ZMM). defm WriteVarBlend : X86SchedWritePair; // Vector variable blends. defm WriteVarBlendY : X86SchedWritePair; // Vector variable blends (YMM/ZMM). defm WritePSADBW : X86SchedWritePair; // Vector PSADBW. +defm WritePSADBWX : X86SchedWritePair; // Vector PSADBW (XMM). defm WritePSADBWY : X86SchedWritePair; // Vector PSADBW (YMM/ZMM). defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD. defm WriteMPSADY : X86SchedWritePair; // Vector MPSAD (YMM/ZMM). @@ -356,11 +362,11 @@ def SchedWriteFVarBlend WriteFVarBlendY, WriteFVarBlendY>; def SchedWriteVecALU - : X86SchedWriteWidths<WriteVecALU, WriteVecALU, WriteVecALUY, WriteVecALUY>; + : X86SchedWriteWidths<WriteVecALU, WriteVecALUX, WriteVecALUY, WriteVecALUY>; def SchedWritePHAdd - : X86SchedWriteWidths<WritePHAdd, WritePHAdd, WritePHAddY, WritePHAddY>; + : X86SchedWriteWidths<WritePHAdd, WritePHAddX, WritePHAddY, WritePHAddY>; def SchedWriteVecLogic - : X86SchedWriteWidths<WriteVecLogic, WriteVecLogic, + : X86SchedWriteWidths<WriteVecLogic, WriteVecLogicX, WriteVecLogicY, WriteVecLogicY>; def SchedWriteVecTest : X86SchedWriteWidths<WriteVecTest, WriteVecTest, @@ -384,14 +390,14 @@ def SchedWriteMPSAD : X86SchedWriteWidths<WriteMPSAD, WriteMPSAD, WriteMPSADY, WriteMPSADY>; def SchedWritePSADBW - : X86SchedWriteWidths<WritePSADBW, WritePSADBW, + : X86SchedWriteWidths<WritePSADBW, WritePSADBWX, WritePSADBWY, WritePSADBWY>; def SchedWriteShuffle - : X86SchedWriteWidths<WriteShuffle, WriteShuffle, + : X86SchedWriteWidths<WriteShuffle, WriteShuffleX, WriteShuffleY, WriteShuffleY>; def SchedWriteVarShuffle - : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffle, + : X86SchedWriteWidths<WriteVarShuffle, WriteVarShuffleX, WriteVarShuffleY, WriteVarShuffleY>; def SchedWriteBlend : X86SchedWriteWidths<WriteBlend, WriteBlend, WriteBlendY, WriteBlendY>; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 91ebc5f32cb..00721ec3057 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -281,8 +281,10 @@ def : WriteRes<WriteVecMaskedStoreY, [AtomPort0]>; def : WriteRes<WriteVecMove, [AtomPort01]>; defm : AtomWriteResPair<WriteVecALU, [AtomPort01], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteVecALUX, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteVecALUY, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteVecLogic, [AtomPort01], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteVecLogicX, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteVecLogicY, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteVecTest, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteVecTestY, [AtomPort01], [AtomPort0], 1, 1>; @@ -300,11 +302,14 @@ defm : AtomWriteResPair<WritePMULLDY, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WritePHMINPOS, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; defm : AtomWriteResPair<WriteMPSAD, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteMPSADY, [AtomPort01], [AtomPort0], 1, 1>; -defm : AtomWriteResPair<WritePSADBW, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; +defm : AtomWriteResPair<WritePSADBW, [AtomPort01], [AtomPort01], 4, 4, [4], [4]>; +defm : AtomWriteResPair<WritePSADBWX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; defm : AtomWriteResPair<WritePSADBWY, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; defm : AtomWriteResPair<WriteShuffle, [AtomPort0], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteShuffleX, [AtomPort0], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteShuffleY, [AtomPort0], [AtomPort0], 1, 1>; -defm : AtomWriteResPair<WriteVarShuffle, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>; +defm : AtomWriteResPair<WriteVarShuffle, [AtomPort0], [AtomPort0], 1, 1>; +defm : AtomWriteResPair<WriteVarShuffleX, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>; defm : AtomWriteResPair<WriteVarShuffleY, [AtomPort01], [AtomPort01], 4, 5, [4], [5]>; defm : AtomWriteResPair<WriteBlend, [AtomPort0], [AtomPort0]>; // NOTE: Doesn't exist on Atom. defm : AtomWriteResPair<WriteBlendY, [AtomPort0], [AtomPort0]>; // NOTE: Doesn't exist on Atom. @@ -355,7 +360,8 @@ defm : AtomWriteResPair<WriteAESDecEnc, [AtomPort01], [AtomPort01]>; // NOTE: Do defm : AtomWriteResPair<WriteFHAdd, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>; defm : AtomWriteResPair<WriteFHAddY, [AtomPort01], [AtomPort01], 8, 9, [8], [9]>; -defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; +defm : AtomWriteResPair<WritePHAdd, [AtomPort01], [AtomPort01], 3, 4, [3], [4]>; +defm : AtomWriteResPair<WritePHAddX, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; defm : AtomWriteResPair<WritePHAddY, [AtomPort01], [AtomPort01], 7, 8, [7], [8]>; //////////////////////////////////////////////////////////////////////////////// @@ -387,7 +393,6 @@ def : InstRW<[AtomWrite0_1], (instrs FXAM, LD_Frr, MOVSX64rr32, MMX_MOVD64rr, MMX_MOVD64to64rr, - MMX_PSHUFBrr, MMX_PSHUFBrm, MOVDI2PDIrr, MOVDI2SSrr, MOV64toPQIrr, @@ -492,7 +497,7 @@ def : InstRW<[AtomWrite01_3], (instrs CLD, LDDQUrm, POP16rmm, POP32rmm, POP64rmm)>; def : InstRW<[AtomWrite01_3], (instregex "XADD(8|16|32|64)rm", "XCHG(8|16|32|64)rm", - "(MMX_)?PH(ADD|SUB)Drr", + "PH(ADD|SUB)Drr", "MOV(S|Z)X16rm8", "MMX_P(ADD|SUB)Qirm", "MOV(UPS|UPD|DQU)rm", @@ -506,9 +511,8 @@ def : InstRW<[AtomWrite01_4], (instrs CBW, CWD, CWDE, CDQ, CDQE, CQO, JCXZ, JECXZ, JRCXZ, SHLD32mrCL, SHRD32mrCL, SHLD32mri8, SHRD32mri8, - LD_F80m, - MMX_PSADBWirr, MMX_PSADBWirm)>; -def : InstRW<[AtomWrite01_4], (instregex "(MMX_)?PH(ADD|SUB)Drm", + LD_F80m)>; +def : InstRW<[AtomWrite01_4], (instregex "PH(ADD|SUB)Drm", "(MMX_)?PEXTRWrr(_REV)?")>; def AtomWrite01_5 : SchedWriteRes<[AtomPort01]> { diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 4004eff4593..18d4d324888 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -407,6 +407,7 @@ defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], def : WriteRes<WriteVecMove, [JFPU01, JVALU]>; defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVecALUY, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVecShiftX, [JFPU01, JVALU], 1>; @@ -422,17 +423,21 @@ defm : JWriteResFpuPair<WritePMULLDY, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2]>; defm : JWriteResFpuPair<WriteMPSADY, [JFPU0, JVIMUL], 3, [1, 2]>; defm : JWriteResFpuPair<WritePSADBW, [JFPU01, JVALU], 2>; +defm : JWriteResFpuPair<WritePSADBWX, [JFPU01, JVALU], 2>; defm : JWriteResFpuPair<WritePSADBWY, [JFPU01, JVALU], 2>; defm : JWriteResFpuPair<WritePHMINPOS, [JFPU0, JVALU], 2>; defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteShuffleX, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteShuffleY, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVarShuffle, [JFPU01, JVALU], 2, [1, 4], 3>; +defm : JWriteResFpuPair<WriteVarShuffleX, [JFPU01, JVALU], 2, [1, 4], 3>; defm : JWriteResFpuPair<WriteVarShuffleY, [JFPU01, JVALU], 2, [1, 4], 3>; defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteBlendY, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVarBlend, [JFPU01, JVALU], 2, [1, 4], 3>; defm : JWriteResFpuPair<WriteVarBlendY, [JFPU01, JVALU], 2, [1, 4], 3>; defm : JWriteResFpuPair<WriteVecLogic, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WriteVecLogicX, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVecLogicY, [JFPU01, JVALU], 1>; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair<WriteVecTest, [JFPU0, JFPA, JALU0], 3>; defm : JWriteResYMMPair<WriteVecTestY , [JFPU01, JFPX, JFPA, JALU0], 4, [2, 2, 2, 1], 3>; @@ -482,6 +487,7 @@ defm : JWriteResFpuPair<WriteAESDecEnc, [JFPU0, JVIMUL], 3, [1, 1], 2>; defm : JWriteResFpuPair<WriteFHAdd, [JFPU0, JFPA], 3>; defm : JWriteResYMMPair<WriteFHAddY, [JFPU0, JFPA], 3, [2,2], 2>; defm : JWriteResFpuPair<WritePHAdd, [JFPU01, JVALU], 1>; +defm : JWriteResFpuPair<WritePHAddX, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WritePHAddY, [JFPU01, JVALU], 1>; //////////////////////////////////////////////////////////////////////////////// diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 92430746a8b..ef5a0f3551b 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -219,10 +219,12 @@ defm : SLMWriteResPair<WriteVecShiftImm, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVecShiftImmX,[SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVecShiftImmY,[SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecLogicX,[SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WriteVecLogicY,[SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WriteVecTest, [SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WriteVecTestY, [SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WriteVecALUX, [SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WriteVecALUY, [SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>; defm : SLMWriteResPair<WriteVecIMulX, [SLM_FPC_RSV0], 4>; @@ -233,13 +235,16 @@ defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 4>; defm : SLMWriteResPair<WritePMULLDY, [SLM_FPC_RSV0], 4>; defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteShuffleY, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteShuffleX, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVarShuffle, [SLM_FPC_RSV0], 1>; +defm : SLMWriteResPair<WriteVarShuffleX, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVarShuffleY, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteBlendY, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>; defm : SLMWriteResPair<WriteMPSADY, [SLM_FPC_RSV0], 7>; defm : SLMWriteResPair<WritePSADBW, [SLM_FPC_RSV0], 4>; +defm : SLMWriteResPair<WritePSADBWX, [SLM_FPC_RSV0], 4>; defm : SLMWriteResPair<WritePSADBWY, [SLM_FPC_RSV0], 4>; defm : SLMWriteResPair<WritePHMINPOS, [SLM_FPC_RSV0], 4>; @@ -260,6 +265,7 @@ def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> { defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 3, [2]>; defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 3, [2]>; defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>; +defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>; defm : SLMWriteResPair<WritePHAddY, [SLM_FPC_RSV01], 1>; // String instructions. diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index b7abbee2652..6f2448fe01c 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -281,10 +281,12 @@ defm : ZnWriteResFpuPair<WriteVecShiftImm, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecShiftImmX, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecShiftImmY, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecLogicX, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecLogicY, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecTest, [ZnFPU12], 1, [2], 1, 7, 1>; defm : ZnWriteResFpuPair<WriteVecTestY, [ZnFPU12], 1, [2], 1, 7, 1>; defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVecALUX, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecALUY, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>; defm : ZnWriteResFpuPair<WriteVecIMulX, [ZnFPU0], 4>; @@ -292,14 +294,17 @@ defm : ZnWriteResFpuPair<WriteVecIMulY, [ZnFPU0], 4>; defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4>; // FIXME defm : ZnWriteResFpuPair<WritePMULLDY, [ZnFPU0], 5, [2]>; // FIXME defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteShuffleX, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteShuffleY, [ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVarShuffle, [ZnFPU], 1>; +defm : ZnWriteResFpuPair<WriteVarShuffleX,[ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteVarShuffleY,[ZnFPU], 1>; defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>; defm : ZnWriteResFpuPair<WriteBlendY, [ZnFPU01], 1>; defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>; defm : ZnWriteResFpuPair<WriteVarShuffle256, [ZnFPU], 2>; defm : ZnWriteResFpuPair<WritePSADBW, [ZnFPU0], 3>; +defm : ZnWriteResFpuPair<WritePSADBWX, [ZnFPU0], 3>; defm : ZnWriteResFpuPair<WritePSADBWY, [ZnFPU0], 3>; defm : ZnWriteResFpuPair<WritePHMINPOS, [ZnFPU0], 4>; @@ -1046,6 +1051,8 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; // PHADD|PHSUB (S) W/D. def : SchedAlias<WritePHAdd, ZnWriteMicrocoded>; def : SchedAlias<WritePHAddLd, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddX, ZnWriteMicrocoded>; +def : SchedAlias<WritePHAddXLd, ZnWriteMicrocoded>; def : SchedAlias<WritePHAddY, ZnWriteMicrocoded>; def : SchedAlias<WritePHAddYLd, ZnWriteMicrocoded>; diff --git a/llvm/test/CodeGen/X86/3dnow-schedule.ll b/llvm/test/CodeGen/X86/3dnow-schedule.ll index 6de1626795f..2c33d686e5a 100644 --- a/llvm/test/CodeGen/X86/3dnow-schedule.ll +++ b/llvm/test/CodeGen/X86/3dnow-schedule.ll @@ -14,8 +14,8 @@ declare void @llvm.x86.mmx.femms() nounwind readnone define i64 @test_pavgusb(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize { ; CHECK-LABEL: test_pavgusb: ; CHECK: # %bb.0: -; CHECK-NEXT: pavgusb %mm1, %mm0 # sched: [1:0.50] -; CHECK-NEXT: pavgusb (%rdi), %mm0 # sched: [7:0.50] +; CHECK-NEXT: pavgusb %mm1, %mm0 # sched: [3:1.00] +; CHECK-NEXT: pavgusb (%rdi), %mm0 # sched: [8:1.00] ; CHECK-NEXT: movq %mm0, %rax # sched: [1:0.33] ; CHECK-NEXT: retq # sched: [1:1.00] %1 = call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %a0, x86_mmx %a1) diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll index 60dc340b817..26c318e478c 100644 --- a/llvm/test/CodeGen/X86/avx2-schedule.ll +++ b/llvm/test/CodeGen/X86/avx2-schedule.ll @@ -1761,8 +1761,8 @@ define <16 x i16> @test_pblendw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) define <16 x i8> @test_pbroadcastb(<16 x i8> %a0, <16 x i8> *%a1) { ; GENERIC-LABEL: test_pbroadcastb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastb %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpbroadcastb (%rdi), %xmm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1811,7 +1811,7 @@ define <32 x i8> @test_pbroadcastb_ymm(<32 x i8> %a0, <32 x i8> *%a1) { ; GENERIC-LABEL: test_pbroadcastb_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastb %xmm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastb (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1859,8 +1859,8 @@ define <32 x i8> @test_pbroadcastb_ymm(<32 x i8> %a0, <32 x i8> *%a1) { define <4 x i32> @test_pbroadcastd(<4 x i32> %a0, <4 x i32> *%a1) { ; GENERIC-LABEL: test_pbroadcastd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastd %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpbroadcastd (%rdi), %xmm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1909,7 +1909,7 @@ define <8 x i32> @test_pbroadcastd_ymm(<8 x i32> %a0, <8 x i32> *%a1) { ; GENERIC-LABEL: test_pbroadcastd_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastd %xmm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastd (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1957,8 +1957,8 @@ define <8 x i32> @test_pbroadcastd_ymm(<8 x i32> %a0, <8 x i32> *%a1) { define <2 x i64> @test_pbroadcastq(<2 x i64> %a0, <2 x i64> *%a1) { ; GENERIC-LABEL: test_pbroadcastq: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastq %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpbroadcastq (%rdi), %xmm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2007,7 +2007,7 @@ define <4 x i64> @test_pbroadcastq_ymm(<4 x i64> %a0, <4 x i64> *%a1) { ; GENERIC-LABEL: test_pbroadcastq_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastq %xmm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastq (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2055,8 +2055,8 @@ define <4 x i64> @test_pbroadcastq_ymm(<4 x i64> %a0, <4 x i64> *%a1) { define <8 x i16> @test_pbroadcastw(<8 x i16> %a0, <8 x i16> *%a1) { ; GENERIC-LABEL: test_pbroadcastw: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastw %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpbroadcastw (%rdi), %xmm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2105,7 +2105,7 @@ define <16 x i16> @test_pbroadcastw_ymm(<16 x i16> %a0, <16 x i16> *%a1) { ; GENERIC-LABEL: test_pbroadcastw_ymm: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpbroadcastw %xmm0, %ymm0 # sched: [1:1.00] -; GENERIC-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [6:1.00] +; GENERIC-NEXT: vpbroadcastw (%rdi), %ymm1 # sched: [7:0.50] ; GENERIC-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; diff --git a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll index 5c9dfad6659..a210b63ba02 100755 --- a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -5657,7 +5657,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> %v ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5677,7 +5677,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask0(<8 x i16> %vec, <8 x i16> ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,7,6] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask0: @@ -5694,7 +5694,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> %ve ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5714,7 +5714,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask1(<8 x i16> %vec, <8 x i16> % ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,3,0,0,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask1: @@ -5731,7 +5731,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> %v ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5751,7 +5751,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask2(<8 x i16> %vec, <8 x i16> ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,4,4,5] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask2: @@ -5781,7 +5781,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> %ve ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5801,7 +5801,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask3(<8 x i16> %vec, <8 x i16> % ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[2,1,1,1,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask3: @@ -5818,7 +5818,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> %v ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5838,7 +5838,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask4(<8 x i16> %vec, <8 x i16> ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,5,5,7,6] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask4: @@ -5855,7 +5855,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> %ve ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5875,7 +5875,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask5(<8 x i16> %vec, <8 x i16> % ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[3,3,2,1,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask5: @@ -5905,7 +5905,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> %v ; GENERIC-LABEL: test_masked_8xi16_perm_high_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm1 {%k1} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5925,7 +5925,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mask6(<8 x i16> %vec, <8 x i16> ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = xmm0[0,1,2,3,6,5,6,5] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mask6: @@ -5942,7 +5942,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> %ve ; GENERIC-LABEL: test_masked_8xi16_perm_low_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5962,7 +5962,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mask7(<8 x i16> %vec, <8 x i16> % ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0,4,5,6,7] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mask7: @@ -5993,7 +5993,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask0: @@ -6012,7 +6012,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask0(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,7,4,6] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask0: @@ -6031,7 +6031,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask1: @@ -6050,7 +6050,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask1(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[1,3,3,2,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask1: @@ -6069,7 +6069,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask2: @@ -6088,7 +6088,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask2(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,6,6,5,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask2: @@ -6121,7 +6121,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask3: @@ -6140,7 +6140,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask3(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[3,1,2,0,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask3: @@ -6159,7 +6159,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask4: @@ -6178,7 +6178,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask4(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,6,7,5] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask4: @@ -6197,7 +6197,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask5: @@ -6216,7 +6216,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask5(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[2,1,3,2,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask5: @@ -6249,7 +6249,7 @@ define <8 x i16> @test_masked_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i16 ; GENERIC-LABEL: test_masked_8xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_high_mem_mask6: @@ -6268,7 +6268,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_high_mem_mask6(<8 x i16>* %vp, <8 x i ; GENERIC-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [6:1.00] +; GENERIC-NEXT: vpshufhw {{.*#+}} xmm0 {%k1} {z} = mem[0,1,2,3,7,4,4,4] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_high_mem_mask6: @@ -6287,7 +6287,7 @@ define <8 x i16> @test_masked_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i16> ; GENERIC-LABEL: test_masked_8xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_8xi16_perm_low_mem_mask7: @@ -6306,7 +6306,7 @@ define <8 x i16> @test_masked_z_8xi16_perm_low_mem_mask7(<8 x i16>* %vp, <8 x i1 ; GENERIC-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmw %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [6:1.00] +; GENERIC-NEXT: vpshuflw {{.*#+}} xmm0 {%k1} {z} = mem[0,3,3,1,4,5,6,7] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_8xi16_perm_low_mem_mask7: @@ -7704,7 +7704,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[2,3,3,0] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7724,7 +7724,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask0(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3,3,0] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask0: @@ -7741,7 +7741,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,0,2,0] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7761,7 +7761,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask1(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,0,2,0] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask1: @@ -7778,7 +7778,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[3,0,1,0] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7798,7 +7798,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask2(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[3,0,1,0] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask2: @@ -7828,7 +7828,7 @@ define <4 x i32> @test_masked_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %vec2, ; GENERIC-LABEL: test_masked_4xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm2, %xmm2, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm1 {%k1} = xmm0[1,1,0,3] sched: [1:0.50] ; GENERIC-NEXT: vmovdqa %xmm1, %xmm0 # sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -7848,7 +7848,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mask3(<4 x i32> %vec, <4 x i32> %mask ; GENERIC-LABEL: test_masked_z_4xi32_perm_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,1,0,3] sched: [1:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mask3: @@ -7879,7 +7879,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,1,3,3] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask0: @@ -7898,7 +7898,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask0(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask0: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,1,3,3] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask0: @@ -7917,7 +7917,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[2,2,3,1] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask1: @@ -7936,7 +7936,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask1(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask1: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[2,2,3,1] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask1: @@ -7955,7 +7955,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[0,3,0,1] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask2: @@ -7974,7 +7974,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask2(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask2: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[0,3,0,1] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask2: @@ -8007,7 +8007,7 @@ define <4 x i32> @test_masked_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> %ve ; GENERIC-LABEL: test_masked_4xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm1, %xmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} = mem[1,0,1,0] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_4xi32_perm_mem_mask3: @@ -8026,7 +8026,7 @@ define <4 x i32> @test_masked_z_4xi32_perm_mem_mask3(<4 x i32>* %vp, <4 x i32> % ; GENERIC-LABEL: test_masked_z_4xi32_perm_mem_mask3: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestnmd %xmm0, %xmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [6:1.00] +; GENERIC-NEXT: vpshufd {{.*#+}} xmm0 {%k1} {z} = mem[1,0,1,0] sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: test_masked_z_4xi32_perm_mem_mask3: diff --git a/llvm/test/CodeGen/X86/xop-schedule.ll b/llvm/test/CodeGen/X86/xop-schedule.ll index 7179cd4c4ac..9a314e2327b 100644 --- a/llvm/test/CodeGen/X86/xop-schedule.ll +++ b/llvm/test/CodeGen/X86/xop-schedule.ll @@ -101,9 +101,9 @@ define void @test_vpcmov_128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2, <2 x i ; GENERIC-LABEL: test_vpcmov_128: ; GENERIC: # %bb.0: ; GENERIC-NEXT: #APP -; GENERIC-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00] -; GENERIC-NEXT: vpcmov (%rdi), %xmm1, %xmm0, %xmm0 # sched: [6:1.00] -; GENERIC-NEXT: vpcmov %xmm2, (%rdi), %xmm0, %xmm0 # sched: [6:1.00] +; GENERIC-NEXT: vpcmov %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50] +; GENERIC-NEXT: vpcmov (%rdi), %xmm1, %xmm0, %xmm0 # sched: [7:0.50] +; GENERIC-NEXT: vpcmov %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:0.50] ; GENERIC-NEXT: #NO_APP ; GENERIC-NEXT: retq # sched: [1:1.00] ; |

