| field | value | date |
|---|---|---|
| author | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-05-11 14:30:54 +0000 |
| committer | Simon Pilgrim <llvm-dev@redking.me.uk> | 2018-05-11 14:30:54 +0000 |
| commit | 22dd72b99556f451b56ca3f9b71607be5fd5968f | |
| tree | ac45d76c6d74c61d8186c578c4e3f9ac8bfb28ce | |
| parent | 8f30ec65b0a6b064235d134344c2503a1ef43d14 | |
[X86] Split WriteF/WriteVec Move/Load/Store scheduler classes by vector width
Fixes an SNB issue where the vlddqu/vmovntdqa ymm loads were missing from the 7-cycle YMM load group and fell back to the 6-cycle default.
llvm-svn: 332094
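
The core of the change is in `llvm/lib/Target/X86/X86Schedule.td`: the move/load/store scheduler classes gain XMM (`X`) and YMM (`Y`) variants, and the width-bundled helpers are rewired to use them instead of aliasing the scalar class. A condensed excerpt from the patch (floating-point classes only; the `WriteVec*` classes are split the same way):

```tablegen
// Scalar, XMM and YMM variants of the FP load/store/move classes.
def WriteFLoad   : SchedWrite;
def WriteFLoadX  : SchedWrite;
def WriteFLoadY  : SchedWrite;
def WriteFStore  : SchedWrite;
def WriteFStoreX : SchedWrite;
def WriteFStoreY : SchedWrite;
def WriteFMove   : SchedWrite;
def WriteFMoveX  : SchedWrite;
def WriteFMoveY  : SchedWrite;

// The width-bundled helpers now reference the per-width classes
// rather than reusing the scalar ones.
def WriteFMoveLS
  : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>;
def WriteFMoveLSX
  : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>;
def WriteFMoveLSY
  : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>;
def SchedWriteFMoveLS
  : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX,
                              WriteFMoveLSY, WriteFMoveLSY>;
```

Note that the fourth (ZMM) slot of `X86SchedWriteMoveLSWidths` still reuses the YMM class, which is why the AVX-512 tests below pick up the new YMM latencies as well.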
| mode | file | lines changed |
|---|---|---|
| -rwxr-xr-x | llvm/lib/Target/X86/X86SchedBroadwell.td | 20 |
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedHaswell.td | 28 |
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSandyBridge.td | 24 |
| -rw-r--r-- | llvm/lib/Target/X86/X86SchedSkylakeClient.td | 24 |
| -rwxr-xr-x | llvm/lib/Target/X86/X86SchedSkylakeServer.td | 45 |
| -rw-r--r-- | llvm/lib/Target/X86/X86Schedule.td | 20 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleAtom.td | 19 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleBtVer2.td | 13 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleSLM.td | 12 |
| -rw-r--r-- | llvm/lib/Target/X86/X86ScheduleZnver1.td | 12 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx-schedule.ll | 4 |
| -rw-r--r-- | llvm/test/CodeGen/X86/avx2-schedule.ll | 2 |
| -rwxr-xr-x | llvm/test/CodeGen/X86/avx512-schedule.ll | 64 |
| -rwxr-xr-x | llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll | 220 |
| -rw-r--r-- | llvm/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s | 4 |
15 files changed, 284 insertions, 227 deletions
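
Each CPU model then assigns its own latencies to the new classes. The SandyBridge hunk is representative, and it is what fixes the missed vlddqu/vmovntdqa ymm case: the shared class previously pinned every FP load at 6 cycles, while the split lets the scalar, XMM and YMM forms differ (the vector-integer classes follow the same pattern). Condensed from the `X86SchedSandyBridge.td` hunk below:

```tablegen
// X86SchedSandyBridge.td: per-width FP load/store/move resources.
defm : X86WriteRes<WriteFLoad,   [SBPort23], 5, [1], 1>;
defm : X86WriteRes<WriteFLoadX,  [SBPort23], 6, [1], 1>;
defm : X86WriteRes<WriteFLoadY,  [SBPort23], 7, [1], 1>;
defm : X86WriteRes<WriteFStore,  [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFMove,   [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX,  [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY,  [SBPort5], 1, [1], 1>;
```

With YMM loads modelled at 7 cycles through the class itself, the per-CPU `InstRW` overrides that enumerated YMM load instructions (and, on SandyBridge, missed `VLDDQUYrm`/`VMOVNTDQAYrm`) can be trimmed, and the `sched:` comments in the tests move from `[6:0.50]` to `[7:0.50]` for YMM/ZMM loads.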
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td index 175b0956231..cae21d60482 100755 --- a/llvm/lib/Target/X86/X86SchedBroadwell.td +++ b/llvm/lib/Target/X86/X86SchedBroadwell.td @@ -162,12 +162,18 @@ defm : BWWriteResPair<WriteJump, [BWPort06], 1>; // Floating point. This covers both scalar and vector operations. defm : X86WriteRes<WriteFLoad, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [BWPort23], 6, [1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>; defm : X86WriteRes<WriteFMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>; defm : X86WriteRes<WriteFStore, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>; defm : BWWriteResPair<WriteFAdd, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub. defm : BWWriteResPair<WriteFAddX, [BWPort1], 3, [1], 1, 5>; // Floating point add/sub (XMM). @@ -256,12 +262,18 @@ def : WriteRes<WriteCvtF2FSt, [BWPort1,BWPort4,BWPort237]> { // Vector integer operations. defm : X86WriteRes<WriteVecLoad, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [BWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [BWPort23], 6, [1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [BWPort23,BWPort5], 7, [1,2], 3>; defm : X86WriteRes<WriteVecMaskedLoadY, [BWPort23,BWPort5], 8, [1,2], 3>; defm : X86WriteRes<WriteVecStore, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [BWPort01,BWPort15,BWPort015,BWPort0156], 31, [8,1,21,1], 31>; defm : BWWriteResPair<WriteVecALU, [BWPort15], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. @@ -935,17 +947,9 @@ def: InstRW<[BWWriteResGroup58], (instregex "LD_F(32|64|80)m", "VBROADCASTI128", "VBROADCASTSDYrm", "VBROADCASTSSYrm", - "VLDDQUYrm", - "VMOVAPDYrm", - "VMOVAPSYrm", "VMOVDDUPYrm", - "VMOVDQAYrm", - "VMOVDQUYrm", - "VMOVNTDQAYrm", "VMOVSHDUPYrm", "VMOVSLDUPYrm", - "VMOVUPDYrm", - "VMOVUPSYrm", "VPBROADCASTDYrm", "VPBROADCASTQYrm")>; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 2a08e2255d1..e6c35444cac 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -153,12 +153,18 @@ defm : HWWriteResPair<WriteIDiv64, [HWPort0, HWDivider], 25, [1,10], 1, 4>; // Scalar and vector floating point. 
defm : X86WriteRes<WriteFLoad, [HWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [HWPort23], 7, [1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>; defm : X86WriteRes<WriteFMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>; defm : X86WriteRes<WriteFStore, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [HWPort01,HWPort15,HWPort015,HWPort0156], 31, [8,1,21,1], 31>; defm : HWWriteResPair<WriteFAdd, [HWPort1], 3, [1], 1, 5>; @@ -248,12 +254,18 @@ def : WriteRes<WriteCvtF2FSt, [HWPort1,HWPort4,HWPort5,HWPort237]> { // Vector integer operations. defm : X86WriteRes<WriteVecLoad, [HWPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [HWPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [HWPort23], 7, [1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [HWPort23,HWPort5], 8, [1,2], 3>; defm : X86WriteRes<WriteVecMaskedLoadY, [HWPort23,HWPort5], 9, [1,2], 3>; defm : X86WriteRes<WriteVecStore, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>; defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1, [1], 1, 5>; defm : HWWriteResPair<WriteVecLogicX,[HWPort015], 1, [1], 1, 6>; @@ -703,16 +715,8 @@ def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> { let ResourceCycles = [1]; } def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm", - "(V?)LDDQUrm", - "(V?)MOVAPDrm", - "(V?)MOVAPSrm", - "(V?)MOVDQArm", - "(V?)MOVDQUrm", - "(V?)MOVNTDQArm", "(V?)MOVSHDUPrm", "(V?)MOVSLDUPrm", - "(V?)MOVUPDrm", - "(V?)MOVUPSrm", "VPBROADCAST(D|Q)rm")>; def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> { @@ -725,17 +729,9 @@ def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F(32|64|80)m", "VBROADCASTI128", "VBROADCASTSDYrm", "VBROADCASTSSYrm", - "VLDDQUYrm", - "VMOVAPDYrm", - "VMOVAPSYrm", "VMOVDDUPYrm", - "VMOVDQAYrm", - "VMOVDQUYrm", - "VMOVNTDQAYrm", "VMOVSHDUPYrm", "VMOVSLDUPYrm", - "VMOVUPDYrm", - "VMOVUPSYrm", "VPBROADCAST(D|Q)Yrm")>; def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> { diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 282ebbe8400..74a5824a036 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -144,13 +144,19 @@ defm : SBWriteResPair<WriteBEXTR, [SBPort05,SBPort1], 2, [1,1], 2>; defm : SBWriteResPair<WriteBZHI, [SBPort1], 1>; // Scalar and vector floating point. 
-defm : X86WriteRes<WriteFLoad, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoad, [SBPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [SBPort23], 7, [1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>; defm : X86WriteRes<WriteFMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>; defm : X86WriteRes<WriteFStore, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreX, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [SBPort015], 31, [31], 31>; defm : SBWriteResPair<WriteFAdd, [SBPort1], 3, [1], 1, 6>; @@ -227,13 +233,19 @@ defm : SBWriteResPair<WriteFVarBlendY,[SBPort05], 2, [2], 2, 7>; def : WriteRes<WriteCvtF2FSt, [SBPort1, SBPort23, SBPort4]> { let Latency = 4; } // Vector integer operations. -defm : X86WriteRes<WriteVecLoad, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoad, [SBPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [SBPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [SBPort23], 7, [1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [SBPort23,SBPort05], 8, [1,2], 3>; defm : X86WriteRes<WriteVecMaskedLoadY, [SBPort23,SBPort05], 9, [1,2], 3>; defm : X86WriteRes<WriteVecStore, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [SBPort05], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>; defm : SBWriteResPair<WriteVecLogic, [SBPort015], 1, [1], 1, 5>; defm : SBWriteResPair<WriteVecLogicX,[SBPort015], 1, [1], 1, 6>; @@ -873,15 +885,9 @@ def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> { } def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm", "VBROADCASTSSYrm", - "VMOVAPDYrm", - "VMOVAPSYrm", "VMOVDDUPYrm", - "VMOVDQAYrm", - "VMOVDQUYrm", "VMOVSHDUPYrm", - "VMOVSLDUPYrm", - "VMOVUPDYrm", - "VMOVUPSYrm")>; + "VMOVSLDUPYrm")>; def SBWriteResGroup55 : SchedWriteRes<[SBPort0,SBPort23]> { let Latency = 7; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index 8e8dee3e61a..20e1a71e5e7 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -157,13 +157,19 @@ def : WriteRes<WriteZero, []>; defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>; // Floating point. This covers both scalar and vector operations. 
-defm : X86WriteRes<WriteFLoad, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoad, [SKLPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [SKLPort23], 7, [1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>; defm : X86WriteRes<WriteFMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>; defm : X86WriteRes<WriteFStore, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [SKLPort05,SKLPort0156], 10, [9,1], 10>; defm : SKLWriteResPair<WriteFAdd, [SKLPort01], 4, [1], 1, 5>; // Floating point add/sub. @@ -248,13 +254,19 @@ def : WriteRes<WriteCvtF2FSt, [SKLPort4,SKLPort5,SKLPort237,SKLPort01]> { // class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } // Vector integer operations. -defm : X86WriteRes<WriteVecLoad, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoad, [SKLPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [SKLPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [SKLPort23], 7, [1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [SKLPort23,SKLPort015], 7, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedLoadY, [SKLPort23,SKLPort015], 8, [1,1], 2>; defm : X86WriteRes<WriteVecStore, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>; defm : SKLWriteResPair<WriteVecALU, [SKLPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. defm : SKLWriteResPair<WriteVecALUX, [SKLPort01], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (XMM). @@ -1111,17 +1123,9 @@ def: InstRW<[SKLWriteResGroup85], (instregex "LD_F(32|64|80)m", "VBROADCASTI128", "VBROADCASTSDYrm", "VBROADCASTSSYrm", - "VLDDQUYrm", - "VMOVAPDYrm", - "VMOVAPSYrm", "VMOVDDUPYrm", - "VMOVDQAYrm", - "VMOVDQUYrm", - "VMOVNTDQAYrm", "VMOVSHDUPYrm", "VMOVSLDUPYrm", - "VMOVUPDYrm", - "VMOVUPSYrm", "VPBROADCASTDYrm", "VPBROADCASTQYrm")>; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index e6f189f4ede..592dcf2d303 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -158,12 +158,18 @@ defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>; // Floating point. This covers both scalar and vector operations. 
defm : X86WriteRes<WriteFLoad, [SKXPort23], 5, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [SKXPort23], 6, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [SKXPort23], 7, [1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>; defm : X86WriteRes<WriteFMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>; defm : X86WriteRes<WriteFStore, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [SKXPort05,SKXPort0156], 10, [9,1], 10>; defm : SKXWriteResPair<WriteFAdd, [SKXPort015], 4, [1], 1, 5>; // Floating point add/sub. @@ -249,12 +255,18 @@ def : WriteRes<WriteCvtF2FSt, [SKXPort4,SKXPort5,SKXPort237,SKXPort015]> { // Vector integer operations. defm : X86WriteRes<WriteVecLoad, [SKXPort23], 5, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [SKXPort23], 6, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [SKXPort23], 7, [1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [SKXPort23,SKXPort015], 7, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedLoadY, [SKXPort23,SKXPort015], 8, [1,1], 2>; defm : X86WriteRes<WriteVecStore, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>; +defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteVecMove, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>; defm : SKXWriteResPair<WriteVecALU, [SKXPort05], 1, [1], 1, 5>; // Vector integer ALU op, no logicals. defm : SKXWriteResPair<WriteVecALUX, [SKXPort01], 1, [1], 1, 6>; // Vector integer ALU op, no logicals (XMM). @@ -1139,27 +1151,9 @@ def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SKXWriteResGroup71], (instregex "LDDQUrm", - "MOVAPDrm", - "MOVAPSrm", - "MOVDQArm", - "MOVDQUrm", - "MOVNTDQArm", - "MOVSHDUPrm", - "MOVSLDUPrm", - "MOVUPDrm", - "MOVUPSrm", - "VBROADCASTSSrm", - "VLDDQUrm", - "VMOVAPDrm", - "VMOVAPSrm", - "VMOVDQArm", - "VMOVDQUrm", - "VMOVNTDQArm", - "VMOVSHDUPrm", - "VMOVSLDUPrm", - "VMOVUPDrm", - "VMOVUPSrm", +def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm", + "(V?)MOVSHDUPrm", + "(V?)MOVSLDUPrm", "VPBROADCASTDrm", "VPBROADCASTQrm")>; @@ -1331,18 +1325,9 @@ def: InstRW<[SKXWriteResGroup89], (instregex "LD_F(32|64|80)m", "VBROADCASTI128", "VBROADCASTSDYrm", "VBROADCASTSSYrm", - "VLDDQUYrm", - "VMOVAPDYrm", - "VMOVAPSYrm", "VMOVDDUPYrm", - "VMOVDQAYrm", - "VMOVDQUYrm", - "VMOVNTDQAYrm", - "VMOVNTDQAZrm(b?)", "VMOVSHDUPYrm", "VMOVSLDUPYrm", - "VMOVUPDYrm", - "VMOVUPSYrm", "VPBROADCASTDYrm", "VPBROADCASTQYrm")>; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index e974c3a6c5a..f23b37ddce7 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -137,12 +137,18 @@ defm WriteJump : X86SchedWritePair; // Floating point. 
This covers both scalar and vector operations. def WriteFLoad : SchedWrite; +def WriteFLoadX : SchedWrite; +def WriteFLoadY : SchedWrite; def WriteFMaskedLoad : SchedWrite; def WriteFMaskedLoadY : SchedWrite; def WriteFStore : SchedWrite; +def WriteFStoreX : SchedWrite; +def WriteFStoreY : SchedWrite; def WriteFMaskedStore : SchedWrite; def WriteFMaskedStoreY : SchedWrite; def WriteFMove : SchedWrite; +def WriteFMoveX : SchedWrite; +def WriteFMoveY : SchedWrite; defm WriteFAdd : X86SchedWritePair; // Floating point add/sub. defm WriteFAddX : X86SchedWritePair; // Floating point add/sub (XMM). @@ -220,12 +226,18 @@ defm WritePHAddY : X86SchedWritePair; // YMM/ZMM. // Vector integer operations. def WriteVecLoad : SchedWrite; +def WriteVecLoadX : SchedWrite; +def WriteVecLoadY : SchedWrite; def WriteVecMaskedLoad : SchedWrite; def WriteVecMaskedLoadY : SchedWrite; def WriteVecStore : SchedWrite; +def WriteVecStoreX : SchedWrite; +def WriteVecStoreY : SchedWrite; def WriteVecMaskedStore : SchedWrite; def WriteVecMaskedStoreY : SchedWrite; def WriteVecMove : SchedWrite; +def WriteVecMoveX : SchedWrite; +def WriteVecMoveY : SchedWrite; defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals. defm WriteVecALUX : X86SchedWritePair; // Vector integer ALU op, no logicals (XMM). @@ -332,9 +344,9 @@ def WriteNop : SchedWrite; def WriteFMoveLS : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>; def WriteFMoveLSX - : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>; + : X86SchedWriteMoveLS<WriteFMoveX, WriteFLoadX, WriteFStoreX>; def WriteFMoveLSY - : X86SchedWriteMoveLS<WriteFMove, WriteFLoad, WriteFStore>; + : X86SchedWriteMoveLS<WriteFMoveY, WriteFLoadY, WriteFStoreY>; def SchedWriteFMoveLS : X86SchedWriteMoveLSWidths<WriteFMoveLS, WriteFMoveLSX, WriteFMoveLSY, WriteFMoveLSY>; @@ -342,9 +354,9 @@ def SchedWriteFMoveLS def WriteVecMoveLS : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>; def WriteVecMoveLSX - : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>; + : X86SchedWriteMoveLS<WriteVecMoveX, WriteVecLoadX, WriteVecStoreX>; def WriteVecMoveLSY - : X86SchedWriteMoveLS<WriteVecMove, WriteVecLoad, WriteVecStore>; + : X86SchedWriteMoveLS<WriteVecMoveY, WriteVecLoadY, WriteVecStoreY>; def SchedWriteVecMoveLS : X86SchedWriteMoveLSWidths<WriteVecMoveLS, WriteVecMoveLSX, WriteVecMoveLSY, WriteVecMoveLSY>; diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td index 00721ec3057..e81bb3605bf 100644 --- a/llvm/lib/Target/X86/X86ScheduleAtom.td +++ b/llvm/lib/Target/X86/X86ScheduleAtom.td @@ -181,15 +181,22 @@ def : WriteRes<WriteNop, [AtomPort01]>; //////////////////////////////////////////////////////////////////////////////// def : WriteRes<WriteFLoad, [AtomPort0]>; +def : WriteRes<WriteFLoadX, [AtomPort0]>; +def : WriteRes<WriteFLoadY, [AtomPort0]>; def : WriteRes<WriteFMaskedLoad, [AtomPort0]>; def : WriteRes<WriteFMaskedLoadY, [AtomPort0]>; def : WriteRes<WriteFStore, [AtomPort0]>; +def : WriteRes<WriteFStoreX, [AtomPort0]>; +def : WriteRes<WriteFStoreY, [AtomPort0]>; def : WriteRes<WriteFMaskedStore, [AtomPort0]>; def : WriteRes<WriteFMaskedStoreY, [AtomPort0]>; -def : WriteRes<WriteFMove, [AtomPort01]>; -defm : X86WriteRes<WriteEMMS,[AtomPort01], 5, [5], 1>; +def : WriteRes<WriteFMove, [AtomPort01]>; +def : WriteRes<WriteFMoveX, [AtomPort01]>; +def : WriteRes<WriteFMoveY, [AtomPort01]>; + +defm : X86WriteRes<WriteEMMS, [AtomPort01], 5, [5], 1>; defm : AtomWriteResPair<WriteFAdd, [AtomPort0], 
[AtomPort0], 5, 5, [5], [5]>; defm : AtomWriteResPair<WriteFAddX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>; @@ -271,14 +278,20 @@ def : WriteRes<WriteCvtF2FSt, [AtomPort0]>; // NOTE: Doesn't exist on Atom. //////////////////////////////////////////////////////////////////////////////// def : WriteRes<WriteVecLoad, [AtomPort0]>; +def : WriteRes<WriteVecLoadX, [AtomPort0]>; +def : WriteRes<WriteVecLoadY, [AtomPort0]>; def : WriteRes<WriteVecMaskedLoad, [AtomPort0]>; def : WriteRes<WriteVecMaskedLoadY, [AtomPort0]>; def : WriteRes<WriteVecStore, [AtomPort0]>; +def : WriteRes<WriteVecStoreX, [AtomPort0]>; +def : WriteRes<WriteVecStoreY, [AtomPort0]>; def : WriteRes<WriteVecMaskedStore, [AtomPort0]>; def : WriteRes<WriteVecMaskedStoreY, [AtomPort0]>; -def : WriteRes<WriteVecMove, [AtomPort01]>; +def : WriteRes<WriteVecMove, [AtomPort01]>; +def : WriteRes<WriteVecMoveX, [AtomPort01]>; +def : WriteRes<WriteVecMoveY, [AtomPort01]>; defm : AtomWriteResPair<WriteVecALU, [AtomPort01], [AtomPort0], 1, 1>; defm : AtomWriteResPair<WriteVecALUX, [AtomPort01], [AtomPort0], 1, 1>; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index 18d4d324888..6ae735fb913 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -268,14 +268,21 @@ def : WriteRes<WriteNop, [JALU01]> { let Latency = 1; } //////////////////////////////////////////////////////////////////////////////// defm : X86WriteRes<WriteFLoad, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFLoadX, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFLoadY, [JLAGU, JFPU01, JFPX], 5, [1, 1, 1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [JLAGU, JFPU01, JFPX], 6, [1, 1, 2], 1>; defm : X86WriteRes<WriteFMaskedLoadY, [JLAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; defm : X86WriteRes<WriteFStore, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>; defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; def : WriteRes<WriteFMove, [JFPU01, JFPX]>; +def : WriteRes<WriteFMoveX, [JFPU01, JFPX]>; +def : WriteRes<WriteFMoveY, [JFPU01, JFPX]>; + def : WriteRes<WriteEMMS, [JFPU01, JFPX]> { let Latency = 2; } defm : JWriteResFpuPair<WriteFAdd, [JFPU0, JFPA], 3>; @@ -397,14 +404,20 @@ def : InstRW<[JWriteCVTSI2FLd], (instregex "(V)?CVTSI(64)?2S(D|S)rm")>; //////////////////////////////////////////////////////////////////////////////// defm : X86WriteRes<WriteVecLoad, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadX, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecLoadY, [JLAGU, JFPU01, JVALU], 5, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [JLAGU, JFPU01, JVALU], 6, [1, 1, 2], 1>; defm : X86WriteRes<WriteVecMaskedLoadY, [JLAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>; defm : X86WriteRes<WriteVecStore, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; +defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>; defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>; defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>; def : WriteRes<WriteVecMove, [JFPU01, JVALU]>; +def : WriteRes<WriteVecMoveX, [JFPU01, JVALU]>; +def : 
WriteRes<WriteVecMoveY, [JFPU01, JVALU]>; defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>; defm : JWriteResFpuPair<WriteVecALUX, [JFPU01, JVALU], 1>; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index ef5a0f3551b..12994063007 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -133,12 +133,18 @@ defm : SLMWriteResPair<WriteIDiv64, [SLM_IEC_RSV01, SLMDivider], 25, [1,25], 1, // Scalar and vector floating point. def : WriteRes<WriteFLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFLoadX, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteFLoadY, [SLM_MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteFMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteFMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteFStore, [SLM_FPC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreX, [SLM_FPC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteFStoreY, [SLM_FPC_RSV01, SLM_MEC_RSV]>; def : WriteRes<WriteFMaskedStore, [SLM_FPC_RSV01, SLM_MEC_RSV]>; def : WriteRes<WriteFMaskedStoreY, [SLM_FPC_RSV01, SLM_MEC_RSV]>; def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>; +def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>; +def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>; defm : X86WriteRes<WriteEMMS, [SLM_FPC_RSV01], 10, [10], 9>; defm : SLMWriteResPair<WriteFAdd, [SLM_FPC_RSV1], 3>; @@ -205,12 +211,18 @@ def : WriteRes<WriteCvtF2FSt, [SLM_FPC_RSV01, SLM_MEC_RSV]>; // Vector integer operations. def : WriteRes<WriteVecLoad, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecLoadX, [SLM_MEC_RSV]> { let Latency = 3; } +def : WriteRes<WriteVecLoadY, [SLM_MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteVecMaskedLoad, [SLM_MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteVecMaskedLoadY, [SLM_MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteVecStore, [SLM_FPC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteVecStoreX, [SLM_FPC_RSV01, SLM_MEC_RSV]>; +def : WriteRes<WriteVecStoreY, [SLM_FPC_RSV01, SLM_MEC_RSV]>; def : WriteRes<WriteVecMaskedStore, [SLM_FPC_RSV01, SLM_MEC_RSV]>; def : WriteRes<WriteVecMaskedStoreY, [SLM_FPC_RSV01, SLM_MEC_RSV]>; def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>; +def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>; +def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>; defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RSV0], 1>; defm : SLMWriteResPair<WriteVecShiftX, [SLM_FPC_RSV0], 1>; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 838b1090955..9af8373f549 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -190,12 +190,18 @@ def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{ // Floating point operations defm : X86WriteRes<WriteFLoad, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteFLoadX, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteFLoadY, [ZnAGU], 8, [1], 1>; defm : X86WriteRes<WriteFMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,1], 1>; defm : X86WriteRes<WriteFMaskedLoadY, [ZnAGU,ZnFPU01], 8, [1,2], 2>; defm : X86WriteRes<WriteFStore, [ZnAGU], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreX, [ZnAGU], 1, [1,1], 1>; +defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1,1], 1>; defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>; defm 
: ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>; defm : ZnWriteResFpuPair<WriteFAddX, [ZnFPU0], 3>; @@ -266,12 +272,18 @@ def : WriteRes<WriteCvtF2FSt, [ZnFPU3, ZnAGU]>; // Vector integer operations which uses FPU units defm : X86WriteRes<WriteVecLoad, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadX, [ZnAGU], 8, [1], 1>; +defm : X86WriteRes<WriteVecLoadY, [ZnAGU], 8, [1], 1>; defm : X86WriteRes<WriteVecMaskedLoad, [ZnAGU,ZnFPU01], 8, [1,2], 2>; defm : X86WriteRes<WriteVecMaskedLoadY, [ZnAGU,ZnFPU01], 9, [1,3], 2>; defm : X86WriteRes<WriteVecStore, [ZnAGU], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1,1], 1>; +defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1,1], 1>; defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>; defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>; +defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteEMMS, [ZnFPU], 2, [1], 1>; defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>; diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll index 54ed9a0a171..12acd112c49 100644 --- a/llvm/test/CodeGen/X86/avx-schedule.ll +++ b/llvm/test/CodeGen/X86/avx-schedule.ll @@ -2007,12 +2007,12 @@ define <8 x float> @test_insertf128(<8 x float> %a0, <4 x float> %a1, <4 x float define <32 x i8> @test_lddqu(i8* %a0) { ; GENERIC-LABEL: test_lddqu: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50] +; GENERIC-NEXT: vlddqu (%rdi), %ymm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SANDY-LABEL: test_lddqu: ; SANDY: # %bb.0: -; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [6:0.50] +; SANDY-NEXT: vlddqu (%rdi), %ymm0 # sched: [7:0.50] ; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_lddqu: diff --git a/llvm/test/CodeGen/X86/avx2-schedule.ll b/llvm/test/CodeGen/X86/avx2-schedule.ll index 26c318e478c..a27a4118263 100644 --- a/llvm/test/CodeGen/X86/avx2-schedule.ll +++ b/llvm/test/CodeGen/X86/avx2-schedule.ll @@ -573,7 +573,7 @@ define <8 x i32> @test_inserti128(<8 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) define <4 x i64> @test_movntdqa(i8* %a0) { ; GENERIC-LABEL: test_movntdqa: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; HASWELL-LABEL: test_movntdqa: diff --git a/llvm/test/CodeGen/X86/avx512-schedule.ll b/llvm/test/CodeGen/X86/avx512-schedule.ll index 673d6b323d5..92f83963b34 100755 --- a/llvm/test/CodeGen/X86/avx512-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-schedule.ll @@ -4641,7 +4641,7 @@ define <64 x i8> @zext_64xi1_to_64xi8(<64 x i8> %x, <64 x i8> %y) #0 { ; GENERIC-LABEL: zext_64xi1_to_64xi8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpeqb %zmm1, %zmm0, %k1 # sched: [1:0.50] -; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: zext_64xi1_to_64xi8: @@ -4695,7 +4695,7 @@ define <32 x i8> @zext_32xi1_to_32xi8(<32 x i16> %x, <32 x i16> %y) #0 { ; GENERIC-LABEL: zext_32xi1_to_32xi8: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vpcmpeqw %zmm1, %zmm0, %k1 # sched: [1:0.50] -; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqu8 {{.*}}(%rip), %ymm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # 
sched: [1:1.00] ; ; SKX-LABEL: zext_32xi1_to_32xi8: @@ -6093,7 +6093,7 @@ define <4 x i32> @mov_test15(i32* %x) { define <16 x i32> @mov_test16(i8 * %addr) { ; GENERIC-LABEL: mov_test16: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test16: @@ -6108,7 +6108,7 @@ define <16 x i32> @mov_test16(i8 * %addr) { define <16 x i32> @mov_test17(i8 * %addr) { ; GENERIC-LABEL: mov_test17: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test17: @@ -6174,7 +6174,7 @@ define void @mov_test20(i8 * %addr, <16 x i32> %data) { define <8 x i64> @mov_test21(i8 * %addr) { ; GENERIC-LABEL: mov_test21: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test21: @@ -6206,7 +6206,7 @@ define void @mov_test22(i8 * %addr, <8 x i64> %data) { define <8 x i64> @mov_test23(i8 * %addr) { ; GENERIC-LABEL: mov_test23: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test23: @@ -6238,7 +6238,7 @@ define void @mov_test24(i8 * %addr, <8 x double> %data) { define <8 x double> @mov_test25(i8 * %addr) { ; GENERIC-LABEL: mov_test25: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test25: @@ -6270,7 +6270,7 @@ define void @mov_test26(i8 * %addr, <16 x float> %data) { define <16 x float> @mov_test27(i8 * %addr) { ; GENERIC-LABEL: mov_test27: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test27: @@ -6302,7 +6302,7 @@ define void @mov_test28(i8 * %addr, <8 x double> %data) { define <8 x double> @mov_test29(i8 * %addr) { ; GENERIC-LABEL: mov_test29: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test29: @@ -6334,7 +6334,7 @@ define void @mov_test30(i8 * %addr, <16 x float> %data) { define <16 x float> @mov_test31(i8 * %addr) { ; GENERIC-LABEL: mov_test31: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test31: @@ -6350,7 +6350,7 @@ define <16 x i32> @mov_test32(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test32: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test32: @@ -6369,7 +6369,7 @@ define <16 x i32> @mov_test33(i8 * %addr, <16 x i32> %old, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test33: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: 
vmovdqu32 (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test33: @@ -6388,7 +6388,7 @@ define <16 x i32> @mov_test34(i8 * %addr, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test34: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa32 (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test34: @@ -6407,7 +6407,7 @@ define <16 x i32> @mov_test35(i8 * %addr, <16 x i32> %mask1) { ; GENERIC-LABEL: mov_test35: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmd %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test35: @@ -6426,7 +6426,7 @@ define <8 x i64> @mov_test36(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test36: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test36: @@ -6445,7 +6445,7 @@ define <8 x i64> @mov_test37(i8 * %addr, <8 x i64> %old, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test37: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmq %zmm1, %zmm1, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test37: @@ -6464,7 +6464,7 @@ define <8 x i64> @mov_test38(i8 * %addr, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test38: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test38: @@ -6483,7 +6483,7 @@ define <8 x i64> @mov_test39(i8 * %addr, <8 x i64> %mask1) { ; GENERIC-LABEL: mov_test39: ; GENERIC: # %bb.0: ; GENERIC-NEXT: vptestmq %zmm0, %zmm0, %k1 # sched: [1:0.33] -; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovdqu64 (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test39: @@ -6503,7 +6503,7 @@ define <16 x float> @mov_test40(i8 * %addr, <16 x float> %old, <16 x float> %mas ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test40: @@ -6524,7 +6524,7 @@ define <16 x float> @mov_test41(i8 * %addr, <16 x float> %old, <16 x float> %mas ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm2, %zmm1, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test41: @@ -6545,7 +6545,7 @@ define <16 x float> @mov_test42(i8 * %addr, <16 x float> %mask1) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # 
sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovaps (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test42: @@ -6566,7 +6566,7 @@ define <16 x float> @mov_test43(i8 * %addr, <16 x float> %mask1) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorps %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqps %zmm1, %zmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test43: @@ -6587,7 +6587,7 @@ define <8 x double> @mov_test44(i8 * %addr, <8 x double> %old, <8 x double> %mas ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test44: @@ -6608,7 +6608,7 @@ define <8 x double> @mov_test45(i8 * %addr, <8 x double> %old, <8 x double> %mas ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm2, %xmm2, %xmm2 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm2, %zmm1, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [6:0.50] +; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test45: @@ -6629,7 +6629,7 @@ define <8 x double> @mov_test46(i8 * %addr, <8 x double> %mask1) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovapd (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test46: @@ -6650,7 +6650,7 @@ define <8 x double> @mov_test47(i8 * %addr, <8 x double> %mask1) { ; GENERIC: # %bb.0: ; GENERIC-NEXT: vxorpd %xmm1, %xmm1, %xmm1 # sched: [1:1.00] ; GENERIC-NEXT: vcmpneq_oqpd %zmm1, %zmm0, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovupd (%rdi), %zmm0 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; SKX-LABEL: mov_test47: @@ -7603,9 +7603,9 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) { define void @ktest_1(<8 x double> %in, double * %base) { ; GENERIC-LABEL: ktest_1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovupd (%rdi), %zmm1 # sched: [6:0.50] +; GENERIC-NEXT: vmovupd (%rdi), %zmm1 # sched: [7:0.50] ; GENERIC-NEXT: vcmpltpd %zmm0, %zmm1, %k1 # sched: [3:1.00] -; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} # sched: [3:1.00] ; GENERIC-NEXT: kortestb %k0, %k0 # sched: [1:0.33] ; GENERIC-NEXT: je .LBB410_2 # sched: [1:1.00] @@ -7665,13 +7665,13 @@ define void @ktest_2(<32 x float> %in, float * %base) { ; ; GENERIC-LABEL: ktest_2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovups (%rdi), %zmm2 # sched: [6:0.50] -; GENERIC-NEXT: vmovups 64(%rdi), %zmm3 # sched: [6:0.50] +; GENERIC-NEXT: vmovups (%rdi), %zmm2 # sched: [7:0.50] +; GENERIC-NEXT: vmovups 64(%rdi), %zmm3 # sched: [7:0.50] ; GENERIC-NEXT: vcmpltps %zmm0, %zmm2, %k1 # sched: [3:1.00] ; 
GENERIC-NEXT: vcmpltps %zmm1, %zmm3, %k2 # sched: [3:1.00] ; GENERIC-NEXT: kunpckwd %k1, %k2, %k0 # sched: [1:1.00] -; GENERIC-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [6:0.50] -; GENERIC-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [6:0.50] +; GENERIC-NEXT: vmovups 68(%rdi), %zmm2 {%k2} {z} # sched: [7:0.50] +; GENERIC-NEXT: vmovups 4(%rdi), %zmm3 {%k1} {z} # sched: [7:0.50] ; GENERIC-NEXT: vcmpltps %zmm3, %zmm0, %k1 # sched: [3:1.00] ; GENERIC-NEXT: vcmpltps %zmm2, %zmm1, %k2 # sched: [3:1.00] ; GENERIC-NEXT: kunpckwd %k1, %k2, %k1 # sched: [1:1.00] diff --git a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll index a210b63ba02..29fe6ca6eb4 100755 --- a/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffle-schedule.ll @@ -401,7 +401,7 @@ define <16 x i16> @test_masked_z_16xi16_perm_mem_mask3(<16 x i16>* %vp, <16 x i1 define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50] ; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -416,7 +416,7 @@ define <32 x i16> @test_32xi16_perm_mask0(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -438,7 +438,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,1,3,31,6,11,23,26,29,5,21,30,1,21,27,10,8,19,14,5,15,13,18,16,9,11,26,8,17,0,23,10] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -457,7 +457,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask0(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: 
vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -479,7 +479,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,8,7,30,11,9,11,30,20,19,22,12,13,20,0,6,10,7,20,12,28,18,13,12,22,13,21,1,14,8,5,16] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -498,7 +498,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask1(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -520,7 +520,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [15,17,24,28,15,9,14,25,28,25,6,31,20,2,23,31,12,21,10,6,22,0,26,16,3,3,20,27,8,31,3,27] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -539,7 +539,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask2(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { ; GENERIC-LABEL: test_32xi16_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50] ; GENERIC-NEXT: vpermw %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -554,7 +554,7 @@ define <32 x i16> @test_32xi16_perm_mask3(<32 x i16> %vec) { define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm2, %zmm2, %k1 # sched: [1:0.33] ; 
GENERIC-NEXT: vpermw %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -576,7 +576,7 @@ define <32 x i16> @test_masked_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %ve define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,2,8,14,25,27,4,16,20,11,27,8,0,1,21,17,30,30,29,1,23,22,20,22,28,20,11,17,6,18,0,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -595,7 +595,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mask3(<32 x i16> %vec, <32 x i16> % define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50] ; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -611,7 +611,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask0(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -632,7 +632,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [19,1,5,31,9,12,17,9,15,7,1,5,16,2,12,10,13,3,29,15,26,31,10,15,22,13,9,23,28,29,20,12] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -653,7 +653,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask0(<32 x i16>* %vp, <32 x i1 define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, 
%k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -674,7 +674,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [31,20,2,2,23,1,0,12,16,14,15,18,21,13,11,31,8,24,13,11,2,27,22,28,14,21,3,12,6,1,30,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -695,7 +695,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask1(<32 x i16>* %vp, <32 x i1 define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -716,7 +716,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,6,12,17,4,31,31,4,12,21,28,15,29,10,15,15,21,6,19,7,10,30,28,26,1,4,8,25,26,18,22,25] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -737,7 +737,7 @@ define <32 x i16> @test_masked_z_32xi16_perm_mem_mask2(<32 x i16>* %vp, <32 x i1 define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) { ; GENERIC-LABEL: test_32xi16_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50] ; GENERIC-NEXT: vpermw (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -753,7 +753,7 @@ define <32 x i16> @test_32xi16_perm_mem_mask3(<32 x i16>* %vp) { define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %vec2, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_32xi16_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50] ; GENERIC-NEXT: 
vptestnmw %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -774,7 +774,7 @@ define <32 x i16> @test_masked_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> define <32 x i16> @test_masked_z_32xi16_perm_mem_mask3(<32 x i16>* %vp, <32 x i16> %mask) { ; GENERIC-LABEL: test_masked_z_32xi16_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,27,1,7,1,0,27,10,5,4,20,30,16,28,16,18,21,25,24,31,23,28,6,17,19,26,15,25,12,18,27] sched: [7:0.50] ; GENERIC-NEXT: vptestnmw %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermw (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1189,7 +1189,7 @@ define <8 x i32> @test_masked_z_8xi32_perm_mem_mask3(<8 x i32>* %vp, <8 x i32> % define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_16xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50] ; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1204,7 +1204,7 @@ define <16 x i32> @test_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -1226,7 +1226,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,12,11,6,4,1,6,9,14,14,6,1,12,11,0,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1245,7 +1245,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask0(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -1267,7 +1267,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, 
<16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [10,0,14,15,11,1,1,5,0,5,0,15,13,1,14,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1286,7 +1286,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask1(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -1308,7 +1308,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [3,10,15,1,0,5,0,9,13,2,1,5,15,2,15,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1327,7 +1327,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask2(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { ; GENERIC-LABEL: test_16xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50] ; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1342,7 +1342,7 @@ define <16 x i32> @test_16xi32_perm_mask3(<16 x i32> %vec) { define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -1364,7 +1364,7 @@ define <16 x i32> @test_masked_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %ve define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,4,14,15,10,2,15,1,9,2,14,15,12,5,3,12] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ 
-1383,7 +1383,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mask3(<16 x i32> %vec, <16 x i32> % define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { ; GENERIC-LABEL: test_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50] ; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1399,7 +1399,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask0(<16 x i32>* %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1420,7 +1420,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,1,6,8,11,2,6,10,1,7,5,15,0,6,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1441,7 +1441,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask0(<16 x i32>* %vp, <16 x i3 define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1462,7 +1462,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,5,3,4,7,15,12,4,8,11,12,7,6,12,6,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1483,7 +1483,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask1(<16 x i32>* %vp, <16 x i3 define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50] ; 
GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1504,7 +1504,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,14,2,7,10,7,3,0,11,9,0,4,12,10,8,2] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1525,7 +1525,7 @@ define <16 x i32> @test_masked_z_16xi32_perm_mem_mask2(<16 x i32>* %vp, <16 x i3 define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { ; GENERIC-LABEL: test_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50] ; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1541,7 +1541,7 @@ define <16 x i32> @test_16xi32_perm_mem_mask3(<16 x i32>* %vp) { define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1562,7 +1562,7 @@ define <16 x i32> @test_masked_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> define <16 x i32> @test_masked_z_16xi32_perm_mem_mask3(<16 x i32>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xi32_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [11,7,10,12,3,12,4,15,1,14,0,4,8,9,6,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -1937,7 +1937,7 @@ define <4 x i64> @test_masked_z_4xi64_perm_mem_mask3(<4 x i64>* %vp, <4 x i64> % define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { ; GENERIC-LABEL: test_8xi64_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,7,6,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -1952,7 +1952,7 @@ define <8 x i64> @test_8xi64_perm_mask0(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,7,6,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; 
GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -1974,7 +1974,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask0(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,7,6,5,5,1,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2030,7 +2030,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask1(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,3,7,3,3,5,4,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -2052,7 +2052,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask2(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,3,3,5,4,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2121,7 +2121,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask3(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [6,3,1,1,7,4,0,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -2143,7 +2143,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask4(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,3,1,1,7,4,0,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2199,7 +2199,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask5(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { ; GENERIC-LABEL: test_8xi64_perm_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,1,4,4,5,4,2,7] sched: [7:0.50] ; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] 
; @@ -2214,7 +2214,7 @@ define <8 x i64> @test_8xi64_perm_mask6(<8 x i64> %vec) { define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,1,4,4,5,4,2,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovdqa64 %zmm1, %zmm0 # sched: [1:0.50] @@ -2236,7 +2236,7 @@ define <8 x i64> @test_masked_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %vec2, define <8 x i64> @test_masked_z_8xi64_perm_mask6(<8 x i64> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,4,4,5,4,2,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2292,7 +2292,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mask7(<8 x i64> %vec, <8 x i64> % define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) { ; GENERIC-LABEL: test_8xi64_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [5,1,6,5,7,3,7,3] sched: [7:0.50] ; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2308,7 +2308,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask0(<8 x i64>* %vp) { define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,1,6,5,7,3,7,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2329,7 +2329,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_mem_mask0(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,1,6,5,7,3,7,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2388,7 +2388,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask1(<8 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,1,4,1,1,5,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2409,7 +2409,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %ve define <8 x i64> 
@test_masked_z_8xi64_perm_mem_mask2(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,1,4,1,1,5,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2482,7 +2482,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask3(<8 x i64>* %vp, <8 x i6 define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [5,0,7,0,3,5,0,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2503,7 +2503,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_mem_mask4(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [5,0,7,0,3,5,0,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2562,7 +2562,7 @@ define <8 x i64> @test_masked_z_8xi64_perm_imm_mem_mask5(<8 x i64>* %vp, <8 x i6 define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) { ; GENERIC-LABEL: test_8xi64_perm_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,6,3,7,3,0,3,6] sched: [7:0.50] ; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -2578,7 +2578,7 @@ define <8 x i64> @test_8xi64_perm_mem_mask6(<8 x i64>* %vp) { define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xi64_perm_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,6,3,7,3,0,3,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -2599,7 +2599,7 @@ define <8 x i64> @test_masked_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %ve define <8 x i64> @test_masked_z_8xi64_perm_mem_mask6(<8 x i64>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xi64_perm_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,6,3,7,3,0,3,6] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermq (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3052,7 +3052,7 @@ define <8 x float> @test_masked_z_8xfloat_perm_mem_mask3(<8 x float>* %vp, <8 x define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { ; GENERIC-LABEL: test_16xfloat_perm_mask0: ; GENERIC: # %bb.0: -; 
GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50] ; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3067,7 +3067,7 @@ define <16 x float> @test_16xfloat_perm_mask0(<16 x float> %vec) { define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] @@ -3089,7 +3089,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask0(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,7,5,13,4,9,11,13,12,6,0,0,11,15,5,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3108,7 +3108,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask0(<16 x float> %vec, <16 x define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] @@ -3130,7 +3130,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask1(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [11,10,4,10,4,5,8,11,2,0,10,0,0,3,10,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3149,7 +3149,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask1(<16 x float> %vec, <16 x define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] @@ 
-3171,7 +3171,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask2(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [0,15,6,14,3,6,5,2,5,15,11,6,6,4,8,11] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3190,7 +3190,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask2(<16 x float> %vec, <16 x define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { ; GENERIC-LABEL: test_16xfloat_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50] ; GENERIC-NEXT: vpermps %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3205,7 +3205,7 @@ define <16 x float> @test_16xfloat_perm_mask3(<16 x float> %vec) { define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm3 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovaps %zmm1, %zmm0 # sched: [1:1.00] @@ -3227,7 +3227,7 @@ define <16 x float> @test_masked_16xfloat_perm_mask3(<16 x float> %vec, <16 x fl define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,0,14,6,6,0,2,13,8,11,2,5,13,13,3] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3246,7 +3246,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mask3(<16 x float> %vec, <16 x define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { ; GENERIC-LABEL: test_16xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50] ; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3262,7 +3262,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask0(<16 x float>* %vp) { define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), 
%zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3283,7 +3283,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,2,1,14,9,9,7,2,9,4,12,11,0,14,0,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3304,7 +3304,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask0(<16 x float>* %vp, <1 define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3325,7 +3325,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [4,2,3,5,11,6,4,7,6,4,14,8,15,12,9,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3346,7 +3346,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask1(<16 x float>* %vp, <1 define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3367,7 +3367,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [10,7,11,6,7,0,11,0,10,9,12,4,10,3,8,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3388,7 +3388,7 @@ define <16 x float> @test_masked_z_16xfloat_perm_mem_mask2(<16 x float>* %vp, <1 define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) { ; GENERIC-LABEL: test_16xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: 
-; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50] ; GENERIC-NEXT: vpermps (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3404,7 +3404,7 @@ define <16 x float> @test_16xfloat_perm_mem_mask3(<16 x float>* %vp) { define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x float> %vec2, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_16xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm2 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3425,7 +3425,7 @@ define <16 x float> @test_masked_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 define <16 x float> @test_masked_z_16xfloat_perm_mem_mask3(<16 x float>* %vp, <16 x i32> %mask) { ; GENERIC-LABEL: test_masked_z_16xfloat_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [15,15,3,9,5,15,14,9,11,10,5,14,14,5,11,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmd %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermps (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3800,7 +3800,7 @@ define <4 x double> @test_masked_z_4xdouble_perm_mem_mask3(<4 x double>* %vp, <4 define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { ; GENERIC-LABEL: test_8xdouble_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [5,7,4,2,7,4,3,4] sched: [7:0.50] ; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -3815,7 +3815,7 @@ define <8 x double> @test_8xdouble_perm_mask0(<8 x double> %vec) { define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [5,7,4,2,7,4,3,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] @@ -3837,7 +3837,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask0(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask0(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [5,7,4,2,7,4,3,4] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3893,7 +3893,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask1(<8 x double> %vec, <8 define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mask2: ; 
GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [7,5,5,5,3,5,1,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] @@ -3915,7 +3915,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask2(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask2(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [7,5,5,5,3,5,1,7] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -3984,7 +3984,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask3(<8 x double> %vec, <8 define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [3,5,3,4,6,5,7,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] @@ -4006,7 +4006,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask4(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask4(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [3,5,3,4,6,5,7,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4062,7 +4062,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask5(<8 x double> %vec, <8 define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { ; GENERIC-LABEL: test_8xdouble_perm_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm1 = [2,7,6,4,0,0,0,2] sched: [7:0.50] ; GENERIC-NEXT: vpermpd %zmm0, %zmm1, %zmm0 # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4077,7 +4077,7 @@ define <8 x double> @test_8xdouble_perm_mask6(<8 x double> %vec) { define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm3 = [2,7,6,4,0,0,0,2] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm2, %zmm2, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm3, %zmm1 {%k1} # sched: [1:1.00] ; GENERIC-NEXT: vmovapd %zmm1, %zmm0 # sched: [1:1.00] @@ -4099,7 +4099,7 @@ define <8 x double> @test_masked_8xdouble_perm_mask6(<8 x double> %vec, <8 x dou define <8 x double> @test_masked_z_8xdouble_perm_mask6(<8 x double> %vec, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = 
[2,7,6,4,0,0,0,2] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,7,6,4,0,0,0,2] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd %zmm0, %zmm2, %zmm0 {%k1} {z} # sched: [1:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4155,7 +4155,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mask7(<8 x double> %vec, <8 define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) { ; GENERIC-LABEL: test_8xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [0,3,4,0,4,2,0,1] sched: [7:0.50] ; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4171,7 +4171,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask0(<8 x double>* %vp) { define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [0,3,4,0,4,2,0,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4192,7 +4192,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x define <8 x double> @test_masked_z_8xdouble_perm_mem_mask0(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [0,3,4,0,4,2,0,1] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4251,7 +4251,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask1(<8 x double>* %vp define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [6,7,2,7,7,6,2,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4272,7 +4272,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x define <8 x double> @test_masked_z_8xdouble_perm_mem_mask2(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [6,7,2,7,7,6,2,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4345,7 +4345,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask3(<8 x double>* %vp define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: 
[6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [1,1,3,5,6,0,6,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4366,7 +4366,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x define <8 x double> @test_masked_z_8xdouble_perm_mem_mask4(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask4: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [1,1,3,5,6,0,6,0] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4425,7 +4425,7 @@ define <8 x double> @test_masked_z_8xdouble_perm_imm_mem_mask5(<8 x double>* %vp define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { ; GENERIC-LABEL: test_8xdouble_perm_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [6:0.50] +; GENERIC-NEXT: vmovaps {{.*#+}} zmm0 = [2,4,0,4,6,1,2,5] sched: [7:0.50] ; GENERIC-NEXT: vpermpd (%rdi), %zmm0, %zmm0 # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4441,7 +4441,7 @@ define <8 x double> @test_8xdouble_perm_mem_mask6(<8 x double>* %vp) { define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x double> %vec2, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_8xdouble_perm_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm2 = [2,4,0,4,6,1,2,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm2, %zmm0 {%k1} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -4462,7 +4462,7 @@ define <8 x double> @test_masked_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x define <8 x double> @test_masked_z_8xdouble_perm_mem_mask6(<8 x double>* %vp, <8 x i64> %mask) { ; GENERIC-LABEL: test_masked_z_8xdouble_perm_mem_mask6: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [6:0.50] +; GENERIC-NEXT: vmovapd {{.*#+}} zmm1 = [2,4,0,4,6,1,2,5] sched: [7:0.50] ; GENERIC-NEXT: vptestnmq %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpermpd (%rdi), %zmm1, %zmm0 {%k1} {z} # sched: [8:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5443,7 +5443,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mask3(<64 x i8> %vec, <64 x i8> %mask define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) { ; GENERIC-LABEL: test_64xi8_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5459,7 +5459,7 @@ define <64 x i8> @test_64xi8_perm_mem_mask0(<64 x i8>* %vp) { define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: 
[7:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5480,7 +5480,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %ve define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask0: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[0,9,15,13,11,11,3,12,4,1,7,5,2,6,14,6,23,27,24,18,30,23,28,22,28,22,19,19,31,25,16,22,35,33,34,32,42,34,41,41,43,40,36,46,37,39,42,40,63,63,62,62,57,55,59,51,52,48,50,48,58,50,60,58] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5501,7 +5501,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask0(<64 x i8>* %vp, <64 x i8> % define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5522,7 +5522,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %ve define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask1: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[15,6,14,7,5,1,14,12,5,7,5,0,0,5,3,8,19,19,26,27,20,29,20,21,27,16,30,17,23,27,16,28,47,39,33,33,33,44,38,46,39,33,38,44,45,32,34,39,50,61,62,53,54,56,52,56,51,52,55,57,56,52,51,49] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5543,7 +5543,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask1(<64 x i8>* %vp, <64 x i8> % define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5564,7 +5564,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %ve define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> %mask) { ; GENERIC-LABEL: 
test_masked_z_64xi8_perm_mem_mask2: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[12,1,11,3,4,11,10,11,8,13,1,10,1,11,5,10,27,26,19,29,19,24,26,19,26,20,18,28,24,21,25,16,34,38,47,40,33,44,44,44,41,43,35,43,45,44,37,41,58,62,49,61,56,53,55,48,51,58,58,55,63,55,53,61] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5585,7 +5585,7 @@ define <64 x i8> @test_masked_z_64xi8_perm_mem_mask2(<64 x i8>* %vp, <64 x i8> % define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) { ; GENERIC-LABEL: test_64xi8_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm0 # sched: [7:0.50] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -5601,7 +5601,7 @@ define <64 x i8> @test_64xi8_perm_mem_mask3(<64 x i8>* %vp) { define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %vec2, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_64xi8_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm2 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm1, %zmm1, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} = zmm2[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] @@ -5622,7 +5622,7 @@ define <64 x i8> @test_masked_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %ve define <64 x i8> @test_masked_z_64xi8_perm_mem_mask3(<64 x i8>* %vp, <64 x i8> %mask) { ; GENERIC-LABEL: test_masked_z_64xi8_perm_mem_mask3: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [6:0.50] +; GENERIC-NEXT: vmovdqa64 (%rdi), %zmm1 # sched: [7:0.50] ; GENERIC-NEXT: vptestnmb %zmm0, %zmm0, %k1 # sched: [1:0.33] ; GENERIC-NEXT: vpshufb {{.*#+}} zmm0 {%k1} {z} = zmm1[4,9,11,13,12,6,0,0,11,15,5,7,11,10,4,10,20,21,24,27,18,16,26,16,16,19,26,17,16,31,22,30,35,38,37,34,37,47,43,38,38,36,40,43,42,39,32,46,54,54,48,50,61,56,59,50,53,61,61,51,48,60,50,60] sched: [8:0.50] ; GENERIC-NEXT: retq # sched: [1:1.00] diff --git a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s index 1bf54d70d0c..955a421263e 100644 --- a/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/SandyBridge/resources-avx1.s @@ -1205,7 +1205,7 @@ vzeroupper # CHECK-NEXT: 1 1 1.00 vinsertps $1, %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 2 7 1.00 * vinsertps $1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 6 0.50 * vlddqu (%rax), %xmm2 -# CHECK-NEXT: 1 6 0.50 * vlddqu (%rax), %ymm2 +# CHECK-NEXT: 1 7 0.50 * vlddqu (%rax), %ymm2 # CHECK-NEXT: 4 5 1.00 * * * vldmxcsr (%rax) # CHECK-NEXT: 1 1 1.00 * * * vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 3 8 1.00 * vmaskmovpd (%rax), %xmm0, %xmm2 @@ -1289,7 +1289,7 @@ vzeroupper # CHECK-NEXT: 1 1 1.00 * vmovntdq %xmm0, (%rax) # CHECK-NEXT: 1 1 1.00 * vmovntdq %ymm0, (%rax) # CHECK-NEXT: 1 6 0.50 * vmovntdqa (%rax), %xmm2 
-# CHECK-NEXT: 1 6 0.50 * vmovntdqa (%rax), %ymm2
+# CHECK-NEXT: 1 7 0.50 * vmovntdqa (%rax), %ymm2
# CHECK-NEXT: 1 1 1.00 * vmovntpd %xmm0, (%rax)
# CHECK-NEXT: 1 1 1.00 * vmovntpd %ymm0, (%rax)
# CHECK-NEXT: 1 1 1.00 * vmovntps %xmm0, (%rax)
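All of the GENERIC checks above run against the generic x86-64 scheduling model, which is based on SandyBridge, so the 256-bit and 512-bit vector loads now report a 7-cycle latency instead of 6 while the 128-bit forms stay at 6. A rough way to reproduce the numbers locally is llvm-mca's instruction-tables mode, in the same spirit as resources-avx1.s. The sketch below is illustrative, not part of the patch: the file name latency-check.s and the particular instruction selection are assumptions, and the command line mirrors the flags the existing SandyBridge llvm-mca tests use.

# A minimal sketch (assumed file: latency-check.s); run with:
#   llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=sandybridge -instruction-tables latency-check.s

vlddqu    (%rax), %xmm2          # 128-bit load: latency column stays at 6
vlddqu    (%rax), %ymm2          # 256-bit load: latency column reads 7 with this patch
vmovntdqa (%rax), %xmm2          # 128-bit non-temporal load: latency 6
vmovntdqa (%rax), %ymm2          # 256-bit non-temporal load: latency 7 with this patch

The Latency/RThroughput pair printed for the YMM forms should line up with the updated CHECK-NEXT rows above and with the sched: [7:0.50] annotations in the GENERIC codegen checks.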